Intel® OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 42810 $
4  * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_str.h"
41 
42 
43 #if KMP_OS_WINDOWS || KMP_OS_LINUX
44 
45 //
46 // Print the affinity mask to the character array in a pretty format.
47 //
48 char *
49 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
50 {
51  KMP_ASSERT(buf_len >= 40);
52  char *scan = buf;
53  char *end = buf + buf_len - 1;
54 
55  //
56  // Find first element / check for empty set.
57  //
58  size_t i;
59  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
60  if (KMP_CPU_ISSET(i, mask)) {
61  break;
62  }
63  }
64  if (i == KMP_CPU_SETSIZE) {
65  sprintf(scan, "{<empty>}");
66  while (*scan != '\0') scan++;
67  KMP_ASSERT(scan <= end);
68  return buf;
69  }
70 
71  sprintf(scan, "{%lu", (unsigned long)i);
72  while (*scan != '\0') scan++;
73  i++;
74  for (; i < KMP_CPU_SETSIZE; i++) {
75  if (! KMP_CPU_ISSET(i, mask)) {
76  continue;
77  }
78 
79  //
80  // Check for buffer overflow. A string of the form ",<n>" will have
81  // at most 10 characters, plus we want to leave room to print ",...}"
82  // if the set is too large to print, for a total of 15 characters.
83  // We already left room for '\0' in setting end.
84  //
85  if (end - scan < 15) {
86  break;
87  }
88  sprintf(scan, ",%lu", (unsigned long)i);
89  while (*scan != '\0') scan++;
90  }
91  if (i < KMP_CPU_SETSIZE) {
92  sprintf(scan, ",...");
93  while (*scan != '\0') scan++;
94  }
95  sprintf(scan, "}");
96  while (*scan != '\0') scan++;
97  KMP_ASSERT(scan <= end);
98  return buf;
99 }
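//
// Illustrative usage sketch (not part of the original source): with a buffer
// of at least KMP_AFFIN_MASK_PRINT_LEN bytes, a mask with bits 0-2 set prints
// as "{0,1,2}", an empty mask prints as "{<empty>}", and a set too large for
// the buffer is truncated with ",...}".
//
//     char buf[KMP_AFFIN_MASK_PRINT_LEN];
//     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
//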
100 
101 
102 void
103 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
104 {
105  KMP_CPU_ZERO(mask);
106 
107 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
108 
109  if (__kmp_num_proc_groups > 1) {
110  int group;
111  struct GROUP_AFFINITY ga;
112  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
113  for (group = 0; group < __kmp_num_proc_groups; group++) {
114  int i;
115  int num = __kmp_GetActiveProcessorCount(group);
116  for (i = 0; i < num; i++) {
117  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
118  }
119  }
120  }
121  else
122 
123 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
124 
125  {
126  int proc;
127  for (proc = 0; proc < __kmp_xproc; proc++) {
128  KMP_CPU_SET(proc, mask);
129  }
130  }
131 }
132 
133 
134 //
135 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
136 // functions.
137 //
138 // The icc codegen emits sections with extremely long names, of the form
139 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
140 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
141 // some sort of memory corruption or table overflow that is triggered by
142 // these long strings. I checked the latest version of the linker -
143 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
144 // fixed.
145 //
146 // Unfortunately, my attempts to reproduce it in a smaller example have
147 // failed - I'm not sure what the prospects are of getting it fixed
148 // properly - but we need a reproducer smaller than all of libiomp.
149 //
150 // Work around the problem by avoiding inline constructors in such builds.
151 // We do this for all platforms, not just Linux* OS - non-inline functions are
152 // more debuggable and provide better coverage than inline functions.
153 // Use inline functions in shipping libs, for performance.
154 //
155 
156 # if !defined(KMP_DEBUG) && !defined(COVER)
157 
158 class Address {
159 public:
160  static const unsigned maxDepth = 32;
161  unsigned labels[maxDepth];
162  unsigned childNums[maxDepth];
163  unsigned depth;
164  unsigned leader;
165  Address(unsigned _depth)
166  : depth(_depth), leader(FALSE) {
167  }
168  Address &operator=(const Address &b) {
169  depth = b.depth;
170  for (unsigned i = 0; i < depth; i++) {
171  labels[i] = b.labels[i];
172  childNums[i] = b.childNums[i];
173  }
174  leader = FALSE;
175  return *this;
176  }
177  bool operator==(const Address &b) const {
178  if (depth != b.depth)
179  return false;
180  for (unsigned i = 0; i < depth; i++)
181  if(labels[i] != b.labels[i])
182  return false;
183  return true;
184  }
185  bool isClose(const Address &b, int level) const {
186  if (depth != b.depth)
187  return false;
188  if ((unsigned)level >= depth)
189  return true;
190  for (unsigned i = 0; i < (depth - level); i++)
191  if(labels[i] != b.labels[i])
192  return false;
193  return true;
194  }
195  bool operator!=(const Address &b) const {
196  return !operator==(b);
197  }
198 };
199 
200 class AddrUnsPair {
201 public:
202  Address first;
203  unsigned second;
204  AddrUnsPair(Address _first, unsigned _second)
205  : first(_first), second(_second) {
206  }
207  AddrUnsPair &operator=(const AddrUnsPair &b)
208  {
209  first = b.first;
210  second = b.second;
211  return *this;
212  }
213 };
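//
// Illustrative sketch (not in the original source): the create_*_map()
// routines below build one Address per OS proc, with labels[] ordered from
// the coarsest topology level to the finest. For example, OS proc 5 on
// package 1, core 2, hardware thread 0 would be recorded roughly as:
//
//     Address addr(3);
//     addr.labels[0] = 1;            // package id
//     addr.labels[1] = 2;            // core id
//     addr.labels[2] = 0;            // hardware thread id
//     AddrUnsPair pair(addr, 5);     // second == OS proc id
//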
214 
215 # else
216 
217 class Address {
218 public:
219  static const unsigned maxDepth = 32;
220  unsigned labels[maxDepth];
221  unsigned childNums[maxDepth];
222  unsigned depth;
223  unsigned leader;
224  Address(unsigned _depth);
225  Address &operator=(const Address &b);
226  bool operator==(const Address &b) const;
227  bool isClose(const Address &b, int level) const;
228  bool operator!=(const Address &b) const;
229 };
230 
231 Address::Address(unsigned _depth)
232 {
233  depth = _depth;
234  leader = FALSE;
235 }
236 
237 Address &Address::operator=(const Address &b) {
238  depth = b.depth;
239  for (unsigned i = 0; i < depth; i++) {
240  labels[i] = b.labels[i];
241  childNums[i] = b.childNums[i];
242  }
243  leader = FALSE;
244  return *this;
245 }
246 
247 bool Address::operator==(const Address &b) const {
248  if (depth != b.depth)
249  return false;
250  for (unsigned i = 0; i < depth; i++)
251  if(labels[i] != b.labels[i])
252  return false;
253  return true;
254 }
255 
256 bool Address::isClose(const Address &b, int level) const {
257  if (depth != b.depth)
258  return false;
259  if ((unsigned)level >= depth)
260  return true;
261  for (unsigned i = 0; i < (depth - level); i++)
262  if(labels[i] != b.labels[i])
263  return false;
264  return true;
265 }
266 
267 bool Address::operator!=(const Address &b) const {
268  return !operator==(b);
269 }
270 
271 class AddrUnsPair {
272 public:
273  Address first;
274  unsigned second;
275  AddrUnsPair(Address _first, unsigned _second);
276  AddrUnsPair &operator=(const AddrUnsPair &b);
277 };
278 
279 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
280  : first(_first), second(_second)
281 {
282 }
283 
284 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
285 {
286  first = b.first;
287  second = b.second;
288  return *this;
289 }
290 
291 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
292 
293 
294 static int
295 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
296 {
297  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
298  ->first);
299  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
300  ->first);
301  unsigned depth = aa->depth;
302  unsigned i;
303  KMP_DEBUG_ASSERT(depth == bb->depth);
304  for (i = 0; i < depth; i++) {
305  if (aa->labels[i] < bb->labels[i]) return -1;
306  if (aa->labels[i] > bb->labels[i]) return 1;
307  }
308  return 0;
309 }
310 
311 
312 static int
313 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
314 {
315  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
316  ->first);
317  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
318  ->first);
319  unsigned depth = aa->depth;
320  unsigned i;
321  KMP_DEBUG_ASSERT(depth == bb->depth);
322  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
323  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
324  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
325  int j = depth - i - 1;
326  if (aa->childNums[j] < bb->childNums[j]) return -1;
327  if (aa->childNums[j] > bb->childNums[j]) return 1;
328  }
329  for (; i < depth; i++) {
330  int j = i - __kmp_affinity_compact;
331  if (aa->childNums[j] < bb->childNums[j]) return -1;
332  if (aa->childNums[j] > bb->childNums[j]) return 1;
333  }
334  return 0;
335 }
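//
// Worked example (illustrative): with depth == 3 and __kmp_affinity_compact
// == 1, the loops above compare childNums[2] first, then childNums[0], then
// childNums[1] - the innermost level is rotated to the front of the sort key.
// Roughly speaking, larger values of __kmp_affinity_compact cause consecutive
// entries in the sorted table to be spread further apart in the topology.
//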
336 
337 
338 //
339 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
340 // called to renumber the labels from [0..n] and place them into the child_num
341 // vector of the address object. This is done in case the labels used for
342 // the children at one node of the hierarchy differ from those used for
343 // another node at the same level. Example: suppose the machine has 2 nodes
344 // with 2 packages each. The first node contains packages 601 and 602, and
345 // the second node contains packages 603 and 604. If we try to sort the table
346 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
347 // because we are paying attention to the labels themselves, not the ordinal
348 // child numbers. By using the child numbers in the sort, the result is
349 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
350 //
351 static void
352 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
353  int numAddrs)
354 {
355  KMP_DEBUG_ASSERT(numAddrs > 0);
356  int depth = address2os->first.depth;
357  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
358  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
359  * sizeof(unsigned));
360  int labCt;
361  for (labCt = 0; labCt < depth; labCt++) {
362  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
363  lastLabel[labCt] = address2os[0].first.labels[labCt];
364  }
365  int i;
366  for (i = 1; i < numAddrs; i++) {
367  for (labCt = 0; labCt < depth; labCt++) {
368  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
369  int labCt2;
370  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
371  counts[labCt2] = 0;
372  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
373  }
374  counts[labCt]++;
375  lastLabel[labCt] = address2os[i].first.labels[labCt];
376  break;
377  }
378  }
379  for (labCt = 0; labCt < depth; labCt++) {
380  address2os[i].first.childNums[labCt] = counts[labCt];
381  }
382  for (; labCt < (int)Address::maxDepth; labCt++) {
383  address2os[i].first.childNums[labCt] = 0;
384  }
385  }
386 }
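//
// Worked example (illustrative), using the machine described in the comment
// above: once sorted by labels, the four packages carry labels
// {0,601}, {0,602}, {1,603}, {1,604}. The routine above assigns childNums
// {0,0}, {0,1}, {1,0}, {1,1} respectively, so a subsequent sort by child
// number can interleave packages from different nodes instead of keeping the
// raw label order 601, 602, 603, 604.
//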
387 
388 
389 //
390 // All of the __kmp_affinity_create_*_map() routines should set
391 // __kmp_affinity_masks to a vector of affinity mask objects of length
392 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
393 // return the number of levels in the machine topology tree (zero if
394 // __kmp_affinity_type == affinity_none).
395 //
396 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
397 // to the affinity mask for the initialization thread. They need to save and
398 // restore the mask, and it could be needed later, so saving it is just an
399 // optimization to avoid calling kmp_get_system_affinity() again.
400 //
401 static kmp_affin_mask_t *fullMask = NULL;
402 
403 kmp_affin_mask_t *
404 __kmp_affinity_get_fullMask() { return fullMask; }
405 
406 
407 static int nCoresPerPkg, nPackages;
408 int __kmp_nThreadsPerCore;
409 
410 //
411 // __kmp_affinity_uniform_topology() doesn't work when called from
412 // places which support arbitrarily many levels in the machine topology
413 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
414 // and __kmp_affinity_create_x2apicid_map().
415 //
416 inline static bool
417 __kmp_affinity_uniform_topology()
418 {
419  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
420 }
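//
// Worked example (illustrative): 2 packages x 4 cores x 2 hardware threads
// with __kmp_avail_proc == 16 is uniform; if some procs are missing from the
// machine model (say __kmp_avail_proc == 12 on the same machine), the product
// no longer matches and the topology is reported as non-uniform.
//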
421 
422 
423 //
424 // Print out the detailed machine topology map, i.e. the physical locations
425 // of each OS proc.
426 //
427 static void
428 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
429  int pkgLevel, int coreLevel, int threadLevel)
430 {
431  int proc;
432 
433  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
434  for (proc = 0; proc < len; proc++) {
435  int level;
436  kmp_str_buf_t buf;
437  __kmp_str_buf_init(&buf);
438  for (level = 0; level < depth; level++) {
439  if (level == threadLevel) {
440  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
441  }
442  else if (level == coreLevel) {
443  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
444  }
445  else if (level == pkgLevel) {
446  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
447  }
448  else if (level > pkgLevel) {
449  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
450  level - pkgLevel - 1);
451  }
452  else {
453  __kmp_str_buf_print(&buf, "L%d ", level);
454  }
455  __kmp_str_buf_print(&buf, "%d ",
456  address2os[proc].first.labels[level]);
457  }
458  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
459  buf.str);
460  __kmp_str_buf_free(&buf);
461  }
462 }
463 
464 
465 //
466 // If we don't know how to retrieve the machine's processor topology, or
467 // encounter an error in doing so, this routine is called to form a "flat"
468 // mapping of os thread id's <-> processor id's.
469 //
470 static int
471 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
472  kmp_i18n_id_t *const msg_id)
473 {
474  *address2os = NULL;
475  *msg_id = kmp_i18n_null;
476 
477  //
478  // Even if __kmp_affinity_type == affinity_none, this routine might still
479  // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
480  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
481  //
482  if (! KMP_AFFINITY_CAPABLE()) {
483  KMP_ASSERT(__kmp_affinity_type == affinity_none);
484  __kmp_ncores = nPackages = __kmp_xproc;
485  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
486  __kmp_ht_enabled = FALSE;
487  if (__kmp_affinity_verbose) {
488  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
489  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
490  KMP_INFORM(Uniform, "KMP_AFFINITY");
491  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
492  __kmp_nThreadsPerCore, __kmp_ncores);
493  }
494  return 0;
495  }
496 
497  //
498  // When affinity is off, this routine will still be called to set
499  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
500  // nCoresPerPkg, & nPackages. Make sure all these vars are set
501  // correctly, and return now if affinity is not enabled.
502  //
503  __kmp_ncores = nPackages = __kmp_avail_proc;
504  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
505  __kmp_ht_enabled = FALSE;
506  if (__kmp_affinity_verbose) {
507  char buf[KMP_AFFIN_MASK_PRINT_LEN];
508  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
509 
510  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
511  if (__kmp_affinity_respect_mask) {
512  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
513  } else {
514  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
515  }
516  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
517  KMP_INFORM(Uniform, "KMP_AFFINITY");
518  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
519  __kmp_nThreadsPerCore, __kmp_ncores);
520  }
521  if (__kmp_affinity_type == affinity_none) {
522  return 0;
523  }
524 
525  //
526  // Construct the data structure to be returned.
527  //
528  *address2os = (AddrUnsPair*)
529  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
530  int avail_ct = 0;
531  unsigned int i;
532  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
533  //
534  // Skip this proc if it is not included in the machine model.
535  //
536  if (! KMP_CPU_ISSET(i, fullMask)) {
537  continue;
538  }
539 
540  Address addr(1);
541  addr.labels[0] = i;
542  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
543  }
544  if (__kmp_affinity_verbose) {
545  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
546  }
547 
548  if (__kmp_affinity_gran_levels < 0) {
549  //
550  // Only the package level is modeled in the machine topology map,
551  // so the #levels of granularity is either 0 or 1.
552  //
553  if (__kmp_affinity_gran > affinity_gran_package) {
554  __kmp_affinity_gran_levels = 1;
555  }
556  else {
557  __kmp_affinity_gran_levels = 0;
558  }
559  }
560  return 1;
561 }
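//
// Illustrative result (assuming OS procs 0-3 are all present in fullMask):
// the flat map built above pairs Address {0} with proc 0, {1} with proc 1,
// and so on - a single package-like level whose label is just the OS proc
// id - and the routine reports a depth of 1.
//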
562 
563 
564 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
565 
566 //
567 // If multiple Windows* OS processor groups exist, we can create a 2-level
568 // topology map with the groups at level 0 and the individual procs at
569 // level 1.
570 //
571 // This facilitates letting the threads float among all procs in a group,
572 // if granularity=group (the default when there are multiple groups).
573 //
574 static int
575 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
576  kmp_i18n_id_t *const msg_id)
577 {
578  *address2os = NULL;
579  *msg_id = kmp_i18n_null;
580 
581  //
582  // If we don't have multiple processor groups, return now.
583  // The flat mapping will be used.
584  //
585  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
586  // FIXME set *msg_id
587  return -1;
588  }
589 
590  //
591  // Construct the data structure to be returned.
592  //
593  *address2os = (AddrUnsPair*)
594  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
595  int avail_ct = 0;
596  int i;
597  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
598  //
599  // Skip this proc if it is not included in the machine model.
600  //
601  if (! KMP_CPU_ISSET(i, fullMask)) {
602  continue;
603  }
604 
605  Address addr(2);
606  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
607  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
608  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
609 
610  if (__kmp_affinity_verbose) {
611  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
612  addr.labels[1]);
613  }
614  }
615 
616  if (__kmp_affinity_gran_levels < 0) {
617  if (__kmp_affinity_gran == affinity_gran_group) {
618  __kmp_affinity_gran_levels = 1;
619  }
620  else if ((__kmp_affinity_gran == affinity_gran_fine)
621  || (__kmp_affinity_gran == affinity_gran_thread)) {
622  __kmp_affinity_gran_levels = 0;
623  }
624  else {
625  const char *gran_str = NULL;
626  if (__kmp_affinity_gran == affinity_gran_core) {
627  gran_str = "core";
628  }
629  else if (__kmp_affinity_gran == affinity_gran_package) {
630  gran_str = "package";
631  }
632  else if (__kmp_affinity_gran == affinity_gran_node) {
633  gran_str = "node";
634  }
635  else {
636  KMP_ASSERT(0);
637  }
638 
639  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
640  __kmp_affinity_gran_levels = 0;
641  }
642  }
643  return 2;
644 }
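//
// Worked example (illustrative): on a 2-group system with 64 procs per group
// (DWORD_PTR is 64 bits), OS proc 70 is recorded with labels {1, 6} - group 1,
// proc 6 within the group - and the routine returns a depth of 2.
//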
645 
646 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
647 
648 
649 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
650 
651 static int
652 __kmp_cpuid_mask_width(int count) {
653  int r = 0;
654 
655  while((1<<r) < count)
656  ++r;
657  return r;
658 }
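//
// Worked examples (illustrative): __kmp_cpuid_mask_width(1) == 0,
// __kmp_cpuid_mask_width(2) == 1, __kmp_cpuid_mask_width(6) == 3, and
// __kmp_cpuid_mask_width(8) == 3 - i.e. the number of bits needed to encode
// "count" distinct values in an Apic Id field.
//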
659 
660 
661 class apicThreadInfo {
662 public:
663  unsigned osId; // param to __kmp_affinity_bind_thread
664  unsigned apicId; // from cpuid after binding
665  unsigned maxCoresPerPkg; // ""
666  unsigned maxThreadsPerPkg; // ""
667  unsigned pkgId; // inferred from above values
668  unsigned coreId; // ""
669  unsigned threadId; // ""
670 };
671 
672 
673 static int
674 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
675 {
676  const apicThreadInfo *aa = (const apicThreadInfo *)a;
677  const apicThreadInfo *bb = (const apicThreadInfo *)b;
678  if (aa->osId < bb->osId) return -1;
679  if (aa->osId > bb->osId) return 1;
680  return 0;
681 }
682 
683 
684 static int
685 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
686 {
687  const apicThreadInfo *aa = (const apicThreadInfo *)a;
688  const apicThreadInfo *bb = (const apicThreadInfo *)b;
689  if (aa->pkgId < bb->pkgId) return -1;
690  if (aa->pkgId > bb->pkgId) return 1;
691  if (aa->coreId < bb->coreId) return -1;
692  if (aa->coreId > bb->coreId) return 1;
693  if (aa->threadId < bb->threadId) return -1;
694  if (aa->threadId > bb->threadId) return 1;
695  return 0;
696 }
697 
698 
699 //
700 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
701 // an algorithm which cycles through the available os threads, setting
702 // the current thread's affinity mask to that thread, and then retrieves
703 // the Apic Id for each thread context using the cpuid instruction.
704 //
705 static int
706 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
707  kmp_i18n_id_t *const msg_id)
708 {
709  int rc;
710  *address2os = NULL;
711  *msg_id = kmp_i18n_null;
712 
713 # if KMP_MIC
714  {
715  // The code below will use cpuid(4).
716  // Check if cpuid(4) is supported.
717  // FIXME? - this really doesn't need to be specific to MIC.
718  kmp_cpuid buf;
719  __kmp_x86_cpuid(0, 0, &buf);
720  if (buf.eax < 4) {
721  *msg_id = kmp_i18n_str_NoLeaf4Support;
722  return -1;
723  }
724  }
725 # endif // KMP_MIC
726 
727  //
728  // Even if __kmp_affinity_type == affinity_none, this routine is still
729  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
730  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
731  //
732  // The algorithm used starts by setting the affinity to each available
733  // thread and retrieving info from the cpuid instruction, so if we are not
734  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
735  // then we need to do something else.
736  //
737  if (! KMP_AFFINITY_CAPABLE()) {
738  //
739  // Hack to try and infer the machine topology using only the data
740  // available from cpuid on the current thread, and __kmp_xproc.
741  //
742  KMP_ASSERT(__kmp_affinity_type == affinity_none);
743 
744  //
745  // Get an upper bound on the number of threads per package using
746  // cpuid(1).
747  //
748  // On some OS/chip combinations where HT is supported by the chip
749  // but is disabled, this value will be 2 on a single core chip.
750  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
751  //
752  kmp_cpuid buf;
753  __kmp_x86_cpuid(1, 0, &buf);
754  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
755  if (maxThreadsPerPkg == 0) {
756  maxThreadsPerPkg = 1;
757  }
758 
759  //
760  // The num cores per pkg comes from cpuid(4).
761  // 1 must be added to the encoded value.
762  //
763  // The author of cpu_count.cpp treated this as only an upper bound
764  // on the number of cores, but I haven't seen any cases where it
765  // was greater than the actual number of cores, so we will treat
766  // it as exact in this block of code.
767  //
768  // First, we need to check if cpuid(4) is supported on this chip.
769  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
770  // has the value n or greater.
771  //
772  __kmp_x86_cpuid(0, 0, &buf);
773  if (buf.eax >= 4) {
774  __kmp_x86_cpuid(4, 0, &buf);
775  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
776  }
777  else {
778  nCoresPerPkg = 1;
779  }
780 
781  //
782  // There is no way to reliably tell if HT is enabled without issuing
783  // the cpuid instruction from every thread, and correlating the cpuid
784  // info, so if the machine is not affinity capable, we assume that HT
785  // is off. We have seen quite a few machines where maxThreadsPerPkg
786  // is 2, yet the machine does not support HT.
787  //
788  // - Older OSes are usually found on machines with older chips, which
789  // do not support HT.
790  //
791  // - The performance penalty for mistakenly identifying a machine as
792  // HT when it isn't (which results in blocktime being incorrectly set
793  // to 0) is greater than the penalty for mistakenly identifying
794  // a machine as being 1 thread/core when it is really HT enabled
795  // (which results in blocktime being incorrectly set to a positive
796  // value).
797  //
798  __kmp_ncores = __kmp_xproc;
799  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
800  __kmp_nThreadsPerCore = 1;
801  __kmp_ht_enabled = FALSE;
802  if (__kmp_affinity_verbose) {
803  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
804  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
805  if (__kmp_affinity_uniform_topology()) {
806  KMP_INFORM(Uniform, "KMP_AFFINITY");
807  } else {
808  KMP_INFORM(NonUniform, "KMP_AFFINITY");
809  }
810  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
811  __kmp_nThreadsPerCore, __kmp_ncores);
812  }
813  return 0;
814  }
815 
816  //
817  //
818  // From here on, we can assume that it is safe to call
819  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
820  // even if __kmp_affinity_type = affinity_none.
821  //
822 
823  //
824  // Save the affinity mask for the current thread.
825  //
826  kmp_affin_mask_t *oldMask;
827  KMP_CPU_ALLOC(oldMask);
828  KMP_ASSERT(oldMask != NULL);
829  __kmp_get_system_affinity(oldMask, TRUE);
830 
831  //
832  // Run through each of the available contexts, binding the current thread
833  // to it, and obtaining the pertinent information using the cpuid instr.
834  //
835  // The relevant information is:
836  //
837  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
838  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
839  //
840  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
841  // value of this field determines the width of the core# + thread#
842  // fields in the Apic Id. It is also an upper bound on the number
843  // of threads per package, but it has been verified that situations
844  // happen where it is not exact. In particular, on certain OS/chip
845  // combinations where Intel(R) Hyper-Threading Technology is supported
846  // by the chip but has
847  // been disabled, the value of this field will be 2 (for a single core
848  // chip). On other OS/chip combinations supporting
849  // Intel(R) Hyper-Threading Technology, the value of
850  // this field will be 1 when Intel(R) Hyper-Threading Technology is
851  // disabled and 2 when it is enabled.
852  //
853  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
854  // value of this field (+1) determines the width of the core# field in
855  // the Apic Id. The comments in "cpucount.cpp" say that this value is
856  // an upper bound, but the IA-32 architecture manual says that it is
857  // exactly the number of cores per package, and I haven't seen any
858  // case where it wasn't.
859  //
860  // From this information, deduce the package Id, core Id, and thread Id,
861  // and set the corresponding fields in the apicThreadInfo struct.
862  //
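//
// Worked example of the decoding (values assumed for illustration):
// maxThreadsPerPkg == 16 gives widthCT == 4; maxCoresPerPkg == 8 gives
// widthC == 3 and widthT == widthCT - widthC == 1. An apicId of 0x2b
// (binary 10 101 1) then decodes as pkgId == 0x2b >> 4 == 2,
// coreId == (0x2b >> 1) & 0x7 == 5, and threadId == 0x2b & 0x1 == 1.
//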
863  unsigned i;
864  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
865  __kmp_avail_proc * sizeof(apicThreadInfo));
866  unsigned nApics = 0;
867  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
868  //
869  // Skip this proc if it is not included in the machine model.
870  //
871  if (! KMP_CPU_ISSET(i, fullMask)) {
872  continue;
873  }
874  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
875 
876  __kmp_affinity_bind_thread(i);
877  threadInfo[nApics].osId = i;
878 
879  //
880  // The apic id and max threads per pkg come from cpuid(1).
881  //
882  kmp_cpuid buf;
883  __kmp_x86_cpuid(1, 0, &buf);
884  if (! ((buf.edx >> 9) & 1)) {
885  __kmp_set_system_affinity(oldMask, TRUE);
886  __kmp_free(threadInfo);
887  KMP_CPU_FREE(oldMask);
888  *msg_id = kmp_i18n_str_ApicNotPresent;
889  return -1;
890  }
891  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
892  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
893  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
894  threadInfo[nApics].maxThreadsPerPkg = 1;
895  }
896 
897  //
898  // Max cores per pkg comes from cpuid(4).
899  // 1 must be added to the encoded value.
900  //
901  // First, we need to check if cpuid(4) is supported on this chip.
902  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
903  // has the value n or greater.
904  //
905  __kmp_x86_cpuid(0, 0, &buf);
906  if (buf.eax >= 4) {
907  __kmp_x86_cpuid(4, 0, &buf);
908  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
909  }
910  else {
911  threadInfo[nApics].maxCoresPerPkg = 1;
912  }
913 
914  //
915  // Infer the pkgId / coreId / threadId using only the info
916  // obtained locally.
917  //
918  int widthCT = __kmp_cpuid_mask_width(
919  threadInfo[nApics].maxThreadsPerPkg);
920  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
921 
922  int widthC = __kmp_cpuid_mask_width(
923  threadInfo[nApics].maxCoresPerPkg);
924  int widthT = widthCT - widthC;
925  if (widthT < 0) {
926  //
927  // I've never seen this one happen, but I suppose it could, if
928  // the cpuid instruction on a chip was really screwed up.
929  // Make sure to restore the affinity mask before the tail call.
930  //
931  __kmp_set_system_affinity(oldMask, TRUE);
932  __kmp_free(threadInfo);
933  KMP_CPU_FREE(oldMask);
934  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
935  return -1;
936  }
937 
938  int maskC = (1 << widthC) - 1;
939  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
940  &maskC;
941 
942  int maskT = (1 << widthT) - 1;
943  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
944 
945  nApics++;
946  }
947 
948  //
949  // We've collected all the info we need.
950  // Restore the old affinity mask for this thread.
951  //
952  __kmp_set_system_affinity(oldMask, TRUE);
953 
954  //
955  // If there's only one thread context to bind to, form an Address object
956  // with depth 1 and return immediately (or, if affinity is off, set
957  // address2os to NULL and return).
958  //
959  // If it is configured to omit the package level when there is only a
960  // single package, the logic at the end of this routine won't work if
961  // there is only a single thread - it would try to form an Address
962  // object with depth 0.
963  //
964  KMP_ASSERT(nApics > 0);
965  if (nApics == 1) {
966  __kmp_ncores = nPackages = 1;
967  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
968  __kmp_ht_enabled = FALSE;
969  if (__kmp_affinity_verbose) {
970  char buf[KMP_AFFIN_MASK_PRINT_LEN];
971  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
972 
973  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
974  if (__kmp_affinity_respect_mask) {
975  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
976  } else {
977  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
978  }
979  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
980  KMP_INFORM(Uniform, "KMP_AFFINITY");
981  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
982  __kmp_nThreadsPerCore, __kmp_ncores);
983  }
984 
985  if (__kmp_affinity_type == affinity_none) {
986  __kmp_free(threadInfo);
987  KMP_CPU_FREE(oldMask);
988  return 0;
989  }
990 
991  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
992  Address addr(1);
993  addr.labels[0] = threadInfo[0].pkgId;
994  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
995 
996  if (__kmp_affinity_gran_levels < 0) {
997  __kmp_affinity_gran_levels = 0;
998  }
999 
1000  if (__kmp_affinity_verbose) {
1001  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1002  }
1003 
1004  __kmp_free(threadInfo);
1005  KMP_CPU_FREE(oldMask);
1006  return 1;
1007  }
1008 
1009  //
1010  // Sort the threadInfo table by physical Id.
1011  //
1012  qsort(threadInfo, nApics, sizeof(*threadInfo),
1013  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1014 
1015  //
1016  // The table is now sorted by pkgId / coreId / threadId, but we really
1017  // don't know the radix of any of the fields. pkgId's may be sparsely
1018  // assigned among the chips on a system. Although coreId's are usually
1019  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1020  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1021  //
1022  // For that matter, we don't know what coresPerPkg and threadsPerCore
1023  // (or the total # packages) are at this point - we want to determine
1024  // that now. We only have an upper bound on the first two figures.
1025  //
1026  // We also perform a consistency check at this point: the values returned
1027  // by the cpuid instruction for any thread bound to a given package had
1028  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1029  //
1030  nPackages = 1;
1031  nCoresPerPkg = 1;
1032  __kmp_nThreadsPerCore = 1;
1033  unsigned nCores = 1;
1034 
1035  unsigned pkgCt = 1; // to determine radii
1036  unsigned lastPkgId = threadInfo[0].pkgId;
1037  unsigned coreCt = 1;
1038  unsigned lastCoreId = threadInfo[0].coreId;
1039  unsigned threadCt = 1;
1040  unsigned lastThreadId = threadInfo[0].threadId;
1041 
1042  // intra-pkg consistency checks
1043  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1044  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1045 
1046  for (i = 1; i < nApics; i++) {
1047  if (threadInfo[i].pkgId != lastPkgId) {
1048  nCores++;
1049  pkgCt++;
1050  lastPkgId = threadInfo[i].pkgId;
1051  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1052  coreCt = 1;
1053  lastCoreId = threadInfo[i].coreId;
1054  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1055  threadCt = 1;
1056  lastThreadId = threadInfo[i].threadId;
1057 
1058  //
1059  // This is a different package, so go on to the next iteration
1060  // without doing any consistency checks. Reset the consistency
1061  // check vars, though.
1062  //
1063  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1064  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1065  continue;
1066  }
1067 
1068  if (threadInfo[i].coreId != lastCoreId) {
1069  nCores++;
1070  coreCt++;
1071  lastCoreId = threadInfo[i].coreId;
1072  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1073  threadCt = 1;
1074  lastThreadId = threadInfo[i].threadId;
1075  }
1076  else if (threadInfo[i].threadId != lastThreadId) {
1077  threadCt++;
1078  lastThreadId = threadInfo[i].threadId;
1079  }
1080  else {
1081  __kmp_free(threadInfo);
1082  KMP_CPU_FREE(oldMask);
1083  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1084  return -1;
1085  }
1086 
1087  //
1088  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1089  // fields agree between all the threads bound to a given package.
1090  //
1091  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1092  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1093  __kmp_free(threadInfo);
1094  KMP_CPU_FREE(oldMask);
1095  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1096  return -1;
1097  }
1098  }
1099  nPackages = pkgCt;
1100  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1101  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1102 
1103  //
1104  // When affinity is off, this routine will still be called to set
1105  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1106  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1107  // correctly, and return now if affinity is not enabled.
1108  //
1109  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1110  __kmp_ncores = nCores;
1111  if (__kmp_affinity_verbose) {
1112  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1113  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1114 
1115  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1116  if (__kmp_affinity_respect_mask) {
1117  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1118  } else {
1119  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1120  }
1121  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1122  if (__kmp_affinity_uniform_topology()) {
1123  KMP_INFORM(Uniform, "KMP_AFFINITY");
1124  } else {
1125  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1126  }
1127  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1128  __kmp_nThreadsPerCore, __kmp_ncores);
1129 
1130  }
1131 
1132  if (__kmp_affinity_type == affinity_none) {
1133  __kmp_free(threadInfo);
1134  KMP_CPU_FREE(oldMask);
1135  return 0;
1136  }
1137 
1138  //
1139  // Now that we've determined the number of packages, the number of cores
1140  // per package, and the number of threads per core, we can construct the
1141  // data structure that is to be returned.
1142  //
1143  int pkgLevel = 0;
1144  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1145  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1146  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1147 
1148  KMP_ASSERT(depth > 0);
1149  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1150 
1151  for (i = 0; i < nApics; ++i) {
1152  Address addr(depth);
1153  unsigned os = threadInfo[i].osId;
1154  int d = 0;
1155 
1156  if (pkgLevel >= 0) {
1157  addr.labels[d++] = threadInfo[i].pkgId;
1158  }
1159  if (coreLevel >= 0) {
1160  addr.labels[d++] = threadInfo[i].coreId;
1161  }
1162  if (threadLevel >= 0) {
1163  addr.labels[d++] = threadInfo[i].threadId;
1164  }
1165  (*address2os)[i] = AddrUnsPair(addr, os);
1166  }
1167 
1168  if (__kmp_affinity_gran_levels < 0) {
1169  //
1170  // Set the granularity level based on what levels are modeled
1171  // in the machine topology map.
1172  //
1173  __kmp_affinity_gran_levels = 0;
1174  if ((threadLevel >= 0)
1175  && (__kmp_affinity_gran > affinity_gran_thread)) {
1176  __kmp_affinity_gran_levels++;
1177  }
1178  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1179  __kmp_affinity_gran_levels++;
1180  }
1181  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1182  __kmp_affinity_gran_levels++;
1183  }
1184  }
1185 
1186  if (__kmp_affinity_verbose) {
1187  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1188  coreLevel, threadLevel);
1189  }
1190 
1191  __kmp_free(threadInfo);
1192  KMP_CPU_FREE(oldMask);
1193  return depth;
1194 }
1195 
1196 
1197 //
1198 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1199 // architectures support a newer interface for specifying the x2APIC Ids,
1200 // based on cpuid leaf 11.
1201 //
1202 static int
1203 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1204  kmp_i18n_id_t *const msg_id)
1205 {
1206  kmp_cpuid buf;
1207 
1208  *address2os = NULL;
1209  *msg_id = kmp_i18n_null;
1210 
1211  //
1212  // Check to see if cpuid leaf 11 is supported.
1213  //
1214  __kmp_x86_cpuid(0, 0, &buf);
1215  if (buf.eax < 11) {
1216  *msg_id = kmp_i18n_str_NoLeaf11Support;
1217  return -1;
1218  }
1219  __kmp_x86_cpuid(11, 0, &buf);
1220  if (buf.ebx == 0) {
1221  *msg_id = kmp_i18n_str_NoLeaf11Support;
1222  return -1;
1223  }
1224 
1225  //
1226  // Find the number of levels in the machine topology. While we're at it,
1227  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1228  // try to get more accurate values later by explicitly counting them,
1229  // but get reasonable defaults now, in case we return early.
1230  //
1231  int level;
1232  int threadLevel = -1;
1233  int coreLevel = -1;
1234  int pkgLevel = -1;
1235  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1236 
1237  for (level = 0;; level++) {
1238  if (level > 31) {
1239  //
1240  // FIXME: Hack for DPD200163180
1241  //
1242  // If level is big then something went wrong -> exiting
1243  //
1244  // There could actually be 32 valid levels in the machine topology,
1245  // but so far, the only machine we have seen which does not exit
1246  // this loop before iteration 32 has fubar x2APIC settings.
1247  //
1248  // For now, just reject this case based upon loop trip count.
1249  //
1250  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1251  return -1;
1252  }
1253  __kmp_x86_cpuid(11, level, &buf);
1254  if (buf.ebx == 0) {
1255  if (pkgLevel < 0) {
1256  //
1257  // Will infer nPackages from __kmp_xproc
1258  //
1259  pkgLevel = level;
1260  level++;
1261  }
1262  break;
1263  }
1264  int kind = (buf.ecx >> 8) & 0xff;
1265  if (kind == 1) {
1266  //
1267  // SMT level
1268  //
1269  threadLevel = level;
1270  coreLevel = -1;
1271  pkgLevel = -1;
1272  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1273  if (__kmp_nThreadsPerCore == 0) {
1274  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1275  return -1;
1276  }
1277  }
1278  else if (kind == 2) {
1279  //
1280  // core level
1281  //
1282  coreLevel = level;
1283  pkgLevel = -1;
1284  nCoresPerPkg = buf.ebx & 0xff;
1285  if (nCoresPerPkg == 0) {
1286  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1287  return -1;
1288  }
1289  }
1290  else {
1291  if (level <= 0) {
1292  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1293  return -1;
1294  }
1295  if (pkgLevel >= 0) {
1296  continue;
1297  }
1298  pkgLevel = level;
1299  nPackages = buf.ebx & 0xff;
1300  if (nPackages == 0) {
1301  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1302  return -1;
1303  }
1304  }
1305  }
1306  int depth = level;
1307 
1308  //
1309  // In the above loop, "level" was counted from the finest level (usually
1310  // thread) to the coarsest. The caller expects that we will place the
1311  // labels in (*address2os)[].first.labels[] in the inverse order, so
1312  // we need to invert the vars saying which level means what.
1313  //
1314  if (threadLevel >= 0) {
1315  threadLevel = depth - threadLevel - 1;
1316  }
1317  if (coreLevel >= 0) {
1318  coreLevel = depth - coreLevel - 1;
1319  }
1320  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1321  pkgLevel = depth - pkgLevel - 1;
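//
// Worked example (illustrative): on a machine whose cpuid leaf 11 reports an
// SMT sub-leaf at level 0 and a core sub-leaf at level 1, the loop above
// exits with threadLevel == 0, coreLevel == 1, pkgLevel == 2, and depth == 3.
// After the inversion, threadLevel == 2, coreLevel == 1, and pkgLevel == 0,
// matching the coarse-to-fine ordering of labels[].
//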
1322 
1323  //
1324  // The algorithm used starts by setting the affinity to each available
1325  // thread and retrieving info from the cpuid instruction, so if we are not
1326  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
1327  // then we need to do something else - use the defaults that we calculated
1328  // from issuing cpuid without binding to each proc.
1329  //
1330  if (! KMP_AFFINITY_CAPABLE())
1331  {
1332  //
1333  // Hack to try and infer the machine topology using only the data
1334  // available from cpuid on the current thread, and __kmp_xproc.
1335  //
1336  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1337 
1338  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1339  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1340  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1341  if (__kmp_affinity_verbose) {
1342  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1343  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1344  if (__kmp_affinity_uniform_topology()) {
1345  KMP_INFORM(Uniform, "KMP_AFFINITY");
1346  } else {
1347  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1348  }
1349  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1350  __kmp_nThreadsPerCore, __kmp_ncores);
1351  }
1352  return 0;
1353  }
1354 
1355  //
1356  //
1357  // From here on, we can assume that it is safe to call
1358  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1359  // even if __kmp_affinity_type = affinity_none.
1360  //
1361 
1362  //
1363  // Save the affinity mask for the current thread.
1364  //
1365  kmp_affin_mask_t *oldMask;
1366  KMP_CPU_ALLOC(oldMask);
1367  __kmp_get_system_affinity(oldMask, TRUE);
1368 
1369  //
1370  // Allocate the data structure to be returned.
1371  //
1372  AddrUnsPair *retval = (AddrUnsPair *)
1373  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1374 
1375  //
1376  // Run through each of the available contexts, binding the current thread
1377  // to it, and obtaining the pertinent information using the cpuid instr.
1378  //
1379  unsigned int proc;
1380  int nApics = 0;
1381  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1382  //
1383  // Skip this proc if it is not included in the machine model.
1384  //
1385  if (! KMP_CPU_ISSET(proc, fullMask)) {
1386  continue;
1387  }
1388  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1389 
1390  __kmp_affinity_bind_thread(proc);
1391 
1392  //
1393  // Extract the labels for each level in the machine topology map
1394  // from the Apic ID.
1395  //
1396  Address addr(depth);
1397  int prev_shift = 0;
1398 
1399  for (level = 0; level < depth; level++) {
1400  __kmp_x86_cpuid(11, level, &buf);
1401  unsigned apicId = buf.edx;
1402  if (buf.ebx == 0) {
1403  if (level != depth - 1) {
1404  KMP_CPU_FREE(oldMask);
1405  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1406  return -1;
1407  }
1408  addr.labels[depth - level - 1] = apicId >> prev_shift;
1409  level++;
1410  break;
1411  }
1412  int shift = buf.eax & 0x1f;
1413  int mask = (1 << shift) - 1;
1414  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1415  prev_shift = shift;
1416  }
1417  if (level != depth) {
1418  KMP_CPU_FREE(oldMask);
1419  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1420  return -1;
1421  }
1422 
1423  retval[nApics] = AddrUnsPair(addr, proc);
1424  nApics++;
1425  }
1426 
1427  //
1428  // We've collected all the info we need.
1429  // Restore the old affinity mask for this thread.
1430  //
1431  __kmp_set_system_affinity(oldMask, TRUE);
1432 
1433  //
1434  // If there's only one thread context to bind to, return now.
1435  //
1436  KMP_ASSERT(nApics > 0);
1437  if (nApics == 1) {
1438  __kmp_ncores = nPackages = 1;
1439  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1440  __kmp_ht_enabled = FALSE;
1441  if (__kmp_affinity_verbose) {
1442  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1443  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1444 
1445  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1446  if (__kmp_affinity_respect_mask) {
1447  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1448  } else {
1449  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1450  }
1451  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1452  KMP_INFORM(Uniform, "KMP_AFFINITY");
1453  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1454  __kmp_nThreadsPerCore, __kmp_ncores);
1455  }
1456 
1457  if (__kmp_affinity_type == affinity_none) {
1458  __kmp_free(retval);
1459  KMP_CPU_FREE(oldMask);
1460  return 0;
1461  }
1462 
1463  //
1464  // Form an Address object which only includes the package level.
1465  //
1466  Address addr(1);
1467  addr.labels[0] = retval[0].first.labels[pkgLevel];
1468  retval[0].first = addr;
1469 
1470  if (__kmp_affinity_gran_levels < 0) {
1471  __kmp_affinity_gran_levels = 0;
1472  }
1473 
1474  if (__kmp_affinity_verbose) {
1475  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1476  }
1477 
1478  *address2os = retval;
1479  KMP_CPU_FREE(oldMask);
1480  return 1;
1481  }
1482 
1483  //
1484  // Sort the table by physical Id.
1485  //
1486  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1487 
1488  //
1489  // Find the radix at each of the levels.
1490  //
1491  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1492  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1493  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1494  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1495  for (level = 0; level < depth; level++) {
1496  totals[level] = 1;
1497  maxCt[level] = 1;
1498  counts[level] = 1;
1499  last[level] = retval[0].first.labels[level];
1500  }
1501 
1502  //
1503  // From here on, the iteration variable "level" runs from the finest
1504  // level to the coarsest, i.e. we iterate forward through
1505  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1506  // backwards.
1507  //
1508  for (proc = 1; (int)proc < nApics; proc++) {
1509  int level;
1510  for (level = 0; level < depth; level++) {
1511  if (retval[proc].first.labels[level] != last[level]) {
1512  int j;
1513  for (j = level + 1; j < depth; j++) {
1514  totals[j]++;
1515  counts[j] = 1;
1516  // The line below causes incorrect topology information to be printed
1517  // when the maximum value for some level (maxCt[level]) appears earlier
1518  // in the array than a smaller value.
1519  // For example, if pkg0 has 4 cores and pkg1 has 2 cores, maxCt[1] ends up
1520  // as 2 whereas it should be 4.
1521  // TODO!!! Check if it can be commented safely
1522  //maxCt[j] = 1;
1523  last[j] = retval[proc].first.labels[j];
1524  }
1525  totals[level]++;
1526  counts[level]++;
1527  if (counts[level] > maxCt[level]) {
1528  maxCt[level] = counts[level];
1529  }
1530  last[level] = retval[proc].first.labels[level];
1531  break;
1532  }
1533  else if (level == depth - 1) {
1534  __kmp_free(last);
1535  __kmp_free(maxCt);
1536  __kmp_free(counts);
1537  __kmp_free(totals);
1538  __kmp_free(retval);
1539  KMP_CPU_FREE(oldMask);
1540  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1541  return -1;
1542  }
1543  }
1544  }
1545 
1546  //
1547  // When affinity is off, this routine will still be called to set
1548  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1549  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1550  // correctly, and return if affinity is not enabled.
1551  //
1552  if (threadLevel >= 0) {
1553  __kmp_nThreadsPerCore = maxCt[threadLevel];
1554  }
1555  else {
1556  __kmp_nThreadsPerCore = 1;
1557  }
1558  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1559 
1560  nPackages = totals[pkgLevel];
1561 
1562  if (coreLevel >= 0) {
1563  __kmp_ncores = totals[coreLevel];
1564  nCoresPerPkg = maxCt[coreLevel];
1565  }
1566  else {
1567  __kmp_ncores = nPackages;
1568  nCoresPerPkg = 1;
1569  }
1570 
1571  //
1572  // Check to see if the machine topology is uniform
1573  //
1574  unsigned prod = maxCt[0];
1575  for (level = 1; level < depth; level++) {
1576  prod *= maxCt[level];
1577  }
1578  bool uniform = (prod == totals[level - 1]);
1579 
1580  //
1581  // Print the machine topology summary.
1582  //
1583  if (__kmp_affinity_verbose) {
1584  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1585  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1586 
1587  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1588  if (__kmp_affinity_respect_mask) {
1589  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1590  } else {
1591  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1592  }
1593  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1594  if (uniform) {
1595  KMP_INFORM(Uniform, "KMP_AFFINITY");
1596  } else {
1597  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1598  }
1599 
1600  kmp_str_buf_t buf;
1601  __kmp_str_buf_init(&buf);
1602 
1603  __kmp_str_buf_print(&buf, "%d", totals[0]);
1604  for (level = 1; level <= pkgLevel; level++) {
1605  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1606  }
1607  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1608  __kmp_nThreadsPerCore, __kmp_ncores);
1609 
1610  __kmp_str_buf_free(&buf);
1611  }
1612 
1613  if (__kmp_affinity_type == affinity_none) {
1614  __kmp_free(last);
1615  __kmp_free(maxCt);
1616  __kmp_free(counts);
1617  __kmp_free(totals);
1618  __kmp_free(retval);
1619  KMP_CPU_FREE(oldMask);
1620  return 0;
1621  }
1622 
1623  //
1624  // Find any levels with radix 1, and remove them from the map
1625  // (except for the package level).
1626  //
1627  int new_depth = 0;
1628  for (level = 0; level < depth; level++) {
1629  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1630  continue;
1631  }
1632  new_depth++;
1633  }
1634 
1635  //
1636  // If we are removing any levels, allocate a new vector to return,
1637  // and copy the relevant information to it.
1638  //
1639  if (new_depth != depth) {
1640  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1641  sizeof(AddrUnsPair) * nApics);
1642  for (proc = 0; (int)proc < nApics; proc++) {
1643  Address addr(new_depth);
1644  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1645  }
1646  int new_level = 0;
1647  for (level = 0; level < depth; level++) {
1648  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1649  if (level == threadLevel) {
1650  threadLevel = -1;
1651  }
1652  else if ((threadLevel >= 0) && (level < threadLevel)) {
1653  threadLevel--;
1654  }
1655  if (level == coreLevel) {
1656  coreLevel = -1;
1657  }
1658  else if ((coreLevel >= 0) && (level < coreLevel)) {
1659  coreLevel--;
1660  }
1661  if (level < pkgLevel) {
1662  pkgLevel--;
1663  }
1664  continue;
1665  }
1666  for (proc = 0; (int)proc < nApics; proc++) {
1667  new_retval[proc].first.labels[new_level]
1668  = retval[proc].first.labels[level];
1669  }
1670  new_level++;
1671  }
1672 
1673  __kmp_free(retval);
1674  retval = new_retval;
1675  depth = new_depth;
1676  }
1677 
1678  if (__kmp_affinity_gran_levels < 0) {
1679  //
1680  // Set the granularity level based on what levels are modeled
1681  // in the machine topology map.
1682  //
1683  __kmp_affinity_gran_levels = 0;
1684  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1685  __kmp_affinity_gran_levels++;
1686  }
1687  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1688  __kmp_affinity_gran_levels++;
1689  }
1690  if (__kmp_affinity_gran > affinity_gran_package) {
1691  __kmp_affinity_gran_levels++;
1692  }
1693  }
1694 
1695  if (__kmp_affinity_verbose) {
1696  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1697  coreLevel, threadLevel);
1698  }
1699 
1700  __kmp_free(last);
1701  __kmp_free(maxCt);
1702  __kmp_free(counts);
1703  __kmp_free(totals);
1704  KMP_CPU_FREE(oldMask);
1705  *address2os = retval;
1706  return depth;
1707 }
1708 
1709 
1710 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1711 
1712 
1713 #define osIdIndex 0
1714 #define threadIdIndex 1
1715 #define coreIdIndex 2
1716 #define pkgIdIndex 3
1717 #define nodeIdIndex 4
1718 
1719 typedef unsigned *ProcCpuInfo;
1720 static unsigned maxIndex = pkgIdIndex;
1721 
1722 
1723 static int
1724 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1725 {
1726  const unsigned *aa = (const unsigned *)a;
1727  const unsigned *bb = (const unsigned *)b;
1728  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1729  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1730  return 0;
1731 };
1732 
1733 
1734 static int
1735 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1736 {
1737  unsigned i;
1738  const unsigned *aa = *((const unsigned **)a);
1739  const unsigned *bb = *((const unsigned **)b);
1740  for (i = maxIndex; ; i--) {
1741  if (aa[i] < bb[i]) return -1;
1742  if (aa[i] > bb[i]) return 1;
1743  if (i == osIdIndex) break;
1744  }
1745  return 0;
1746 }
1747 
1748 
1749 //
1750 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1751 // affinity map.
1752 //
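//
// Illustrative input sketch (typical /proc/cpuinfo fields; the exact set
// varies by kernel and architecture): each logical processor contributes a
// record such as
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//
// separated by blank lines. The parser below counts the "processor" records
// and also recognizes "node_<n> id" fields when an alternate topology file
// is supplied (e.g. via KMP_CPUINFO_FILE).
//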
1753 static int
1754 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1755  kmp_i18n_id_t *const msg_id, FILE *f)
1756 {
1757  *address2os = NULL;
1758  *msg_id = kmp_i18n_null;
1759 
1760  //
1761  // Scan the file, count the number of "processor" (osId) fields, and find
1762  // the highest value of <n> for a node_<n> field.
1763  //
1764  char buf[256];
1765  unsigned num_records = 0;
1766  while (! feof(f)) {
1767  buf[sizeof(buf) - 1] = 1;
1768  if (! fgets(buf, sizeof(buf), f)) {
1769  //
1770  // Read errors presumably because of EOF
1771  //
1772  break;
1773  }
1774 
1775  char s1[] = "processor";
1776  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1777  num_records++;
1778  continue;
1779  }
1780 
1781  //
1782  // FIXME - this will match "node_<n> <garbage>"
1783  //
1784  unsigned level;
1785  if (sscanf(buf, "node_%u id", &level) == 1) {
1786  if (nodeIdIndex + level >= maxIndex) {
1787  maxIndex = nodeIdIndex + level;
1788  }
1789  continue;
1790  }
1791  }
1792 
1793  //
1794  // Check for empty file / no valid processor records, or too many.
1795  // The number of records can't exceed the number of valid bits in the
1796  // affinity mask.
1797  //
1798  if (num_records == 0) {
1799  *line = 0;
1800  *msg_id = kmp_i18n_str_NoProcRecords;
1801  return -1;
1802  }
1803  if (num_records > (unsigned)__kmp_xproc) {
1804  *line = 0;
1805  *msg_id = kmp_i18n_str_TooManyProcRecords;
1806  return -1;
1807  }
1808 
1809  //
1810  // Set the file pointer back to the beginning, so that we can scan the
1811  // file again, this time performing a full parse of the data.
1812  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1813  // Adding an extra element at the end allows us to remove a lot of extra
1814  // checks for termination conditions.
1815  //
1816  if (fseek(f, 0, SEEK_SET) != 0) {
1817  *line = 0;
1818  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1819  return -1;
1820  }
1821 
1822  //
1823  // Allocate the array of records to store the proc info in. The dummy
1824  // element at the end makes the logic in filling them out easier to code.
1825  //
1826  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1827  * sizeof(unsigned *));
1828  unsigned i;
1829  for (i = 0; i <= num_records; i++) {
1830  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1831  * sizeof(unsigned));
1832  }
1833 
1834 #define CLEANUP_THREAD_INFO \
1835  for (i = 0; i <= num_records; i++) { \
1836  __kmp_free(threadInfo[i]); \
1837  } \
1838  __kmp_free(threadInfo);
1839 
1840  //
1841  // A value of UINT_MAX means that we didn't find the field
1842  //
1843  unsigned __index;
1844 
1845 #define INIT_PROC_INFO(p) \
1846  for (__index = 0; __index <= maxIndex; __index++) { \
1847  (p)[__index] = UINT_MAX; \
1848  }
1849 
1850  for (i = 0; i <= num_records; i++) {
1851  INIT_PROC_INFO(threadInfo[i]);
1852  }
1853 
1854  unsigned num_avail = 0;
1855  *line = 0;
1856  while (! feof(f)) {
1857  //
1858  // Create an inner scoping level, so that all the goto targets at the
1859  // end of the loop appear in an outer scoping level. This avoids
1860  // warnings about jumping past an initialization to a target in the
1861  // same block.
1862  //
1863  {
1864  buf[sizeof(buf) - 1] = 1;
1865  bool long_line = false;
1866  if (! fgets(buf, sizeof(buf), f)) {
1867  //
1868  // Read errors presumably because of EOF
1869  //
1870  // If there is valid data in threadInfo[num_avail], then fake
1871  // a blank line to ensure that the last address gets parsed.
1872  //
1873  bool valid = false;
1874  for (i = 0; i <= maxIndex; i++) {
1875  if (threadInfo[num_avail][i] != UINT_MAX) {
1876  valid = true;
1877  }
1878  }
1879  if (! valid) {
1880  break;
1881  }
1882  buf[0] = 0;
1883  } else if (!buf[sizeof(buf) - 1]) {
1884  //
1885  // The line is longer than the buffer. Set a flag and don't
1886  // emit an error if we were going to ignore the line, anyway.
1887  //
1888  long_line = true;
1889 
1890 #define CHECK_LINE \
1891  if (long_line) { \
1892  CLEANUP_THREAD_INFO; \
1893  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1894  return -1; \
1895  }
1896  }
1897  (*line)++;
1898 
1899  char s1[] = "processor";
1900  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1901  CHECK_LINE;
1902  char *p = strchr(buf + sizeof(s1) - 1, ':');
1903  unsigned val;
1904  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1905  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1906  threadInfo[num_avail][osIdIndex] = val;
1907 #if KMP_OS_LINUX && USE_SYSFS_INFO
1908  char path[256];
1909  snprintf(path, sizeof(path),
1910  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1911  threadInfo[num_avail][osIdIndex]);
1912  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1913 
1914  snprintf(path, sizeof(path),
1915  "/sys/devices/system/cpu/cpu%u/topology/core_id",
1916  threadInfo[num_avail][osIdIndex]);
1917  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1918  continue;
1919 #else
1920  }
1921  char s2[] = "physical id";
1922  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1923  CHECK_LINE;
1924  char *p = strchr(buf + sizeof(s2) - 1, ':');
1925  unsigned val;
1926  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1927  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1928  threadInfo[num_avail][pkgIdIndex] = val;
1929  continue;
1930  }
1931  char s3[] = "core id";
1932  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1933  CHECK_LINE;
1934  char *p = strchr(buf + sizeof(s3) - 1, ':');
1935  unsigned val;
1936  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1937  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1938  threadInfo[num_avail][coreIdIndex] = val;
1939  continue;
1940 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
1941  }
1942  char s4[] = "thread id";
1943  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1944  CHECK_LINE;
1945  char *p = strchr(buf + sizeof(s4) - 1, ':');
1946  unsigned val;
1947  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1948  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1949  threadInfo[num_avail][threadIdIndex] = val;
1950  continue;
1951  }
1952  unsigned level;
1953  if (sscanf(buf, "node_%u id", &level) == 1) {
1954  CHECK_LINE;
1955  char *p = strchr(buf + sizeof(s4) - 1, ':');
1956  unsigned val;
1957  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1958  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1959  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1960  threadInfo[num_avail][nodeIdIndex + level] = val;
1961  continue;
1962  }
1963 
1964  //
1965  // We didn't recognize the leading token on the line.
1966  // There are lots of leading tokens that we don't recognize -
1967  // if the line isn't empty, go on to the next line.
1968  //
1969  if ((*buf != 0) && (*buf != '\n')) {
1970  //
1971  // If the line is longer than the buffer, read characters
1972  // until we find a newline.
1973  //
1974  if (long_line) {
1975  int ch;
1976  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1977  }
1978  continue;
1979  }
1980 
1981  //
1982  // A newline has signalled the end of the processor record.
1983  // Check that there aren't too many procs specified.
1984  //
1985  if (num_avail == __kmp_xproc) {
1986  CLEANUP_THREAD_INFO;
1987  *msg_id = kmp_i18n_str_TooManyEntries;
1988  return -1;
1989  }
1990 
1991  //
1992  // Check for missing fields. The osId field must be there, and we
1993  // currently require that the physical id field is specified, also.
1994  //
1995  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1996  CLEANUP_THREAD_INFO;
1997  *msg_id = kmp_i18n_str_MissingProcField;
1998  return -1;
1999  }
2000  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2001  CLEANUP_THREAD_INFO;
2002  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2003  return -1;
2004  }
2005 
2006  //
2007  // Skip this proc if it is not included in the machine model.
2008  //
2009  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2010  INIT_PROC_INFO(threadInfo[num_avail]);
2011  continue;
2012  }
2013 
2014  //
2015  // We have a successful parse of this proc's info.
2016  // Increment the counter, and prepare for the next proc.
2017  //
2018  num_avail++;
2019  KMP_ASSERT(num_avail <= num_records);
2020  INIT_PROC_INFO(threadInfo[num_avail]);
2021  }
2022  continue;
2023 
2024  no_val:
2025  CLEANUP_THREAD_INFO;
2026  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2027  return -1;
2028 
2029  dup_field:
2030  CLEANUP_THREAD_INFO;
2031  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2032  return -1;
2033  }
2034  *line = 0;
2035 
2036 # if KMP_MIC && REDUCE_TEAM_SIZE
2037  unsigned teamSize = 0;
2038 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2039 
2040  // check for num_records == __kmp_xproc ???
2041 
2042  //
2043  // If there's only one thread context to bind to, form an Address object
2044  // with depth 1 and return immediately (or, if affinity is off, set
2045  // address2os to NULL and return).
2046  //
2047  // If it is configured to omit the package level when there is only a
2048  // single package, the logic at the end of this routine won't work if
2049  // there is only a single thread - it would try to form an Address
2050  // object with depth 0.
2051  //
2052  KMP_ASSERT(num_avail > 0);
2053  KMP_ASSERT(num_avail <= num_records);
2054  if (num_avail == 1) {
2055  __kmp_ncores = 1;
2056  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2057  __kmp_ht_enabled = FALSE;
2058  if (__kmp_affinity_verbose) {
2059  if (! KMP_AFFINITY_CAPABLE()) {
2060  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2061  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2062  KMP_INFORM(Uniform, "KMP_AFFINITY");
2063  }
2064  else {
2065  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2066  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2067  fullMask);
2068  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2069  if (__kmp_affinity_respect_mask) {
2070  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2071  } else {
2072  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2073  }
2074  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2075  KMP_INFORM(Uniform, "KMP_AFFINITY");
2076  }
2077  int index;
2078  kmp_str_buf_t buf;
2079  __kmp_str_buf_init(&buf);
2080  __kmp_str_buf_print(&buf, "1");
2081  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2082  __kmp_str_buf_print(&buf, " x 1");
2083  }
2084  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2085  __kmp_str_buf_free(&buf);
2086  }
2087 
2088  if (__kmp_affinity_type == affinity_none) {
2089  CLEANUP_THREAD_INFO;
2090  return 0;
2091  }
2092 
2093  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2094  Address addr(1);
2095  addr.labels[0] = threadInfo[0][pkgIdIndex];
2096  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2097 
2098  if (__kmp_affinity_gran_levels < 0) {
2099  __kmp_affinity_gran_levels = 0;
2100  }
2101 
2102  if (__kmp_affinity_verbose) {
2103  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2104  }
2105 
2106  CLEANUP_THREAD_INFO;
2107  return 1;
2108  }
2109 
2110  //
2111  // Sort the threadInfo table by physical Id.
2112  //
2113  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2114  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2115 
2116  //
2117  // The table is now sorted by pkgId / coreId / threadId, but we really
2118  // don't know the radix of any of the fields. pkgId's may be sparsely
2119  // assigned among the chips on a system. Although coreId's are usually
2120  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2121  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2122  //
2123  // For that matter, we don't know what coresPerPkg and threadsPerCore
2124  // (or the total # packages) are at this point - we want to determine
2125  // that now. We only have an upper bound on the first two figures.
2126  //
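    //
    // As a hypothetical example, for 4 OS procs on one package with 2 cores
    // and 2 threads per core, the radix count below ends with
    // maxCt[threadIdIndex] == 2, maxCt[coreIdIndex] == 2,
    // totals[coreIdIndex] == 2, and totals[pkgIdIndex] == 1.
    //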
2127  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2128  * sizeof(unsigned));
2129  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2130  * sizeof(unsigned));
2131  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2132  * sizeof(unsigned));
2133  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2134  * sizeof(unsigned));
2135 
2136  bool assign_thread_ids = false;
2137  unsigned threadIdCt;
2138  unsigned index;
2139 
2140  restart_radix_check:
2141  threadIdCt = 0;
2142 
2143  //
2144  // Initialize the counter arrays with data from threadInfo[0].
2145  //
2146  if (assign_thread_ids) {
2147  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2148  threadInfo[0][threadIdIndex] = threadIdCt++;
2149  }
2150  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2151  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2152  }
2153  }
2154  for (index = 0; index <= maxIndex; index++) {
2155  counts[index] = 1;
2156  maxCt[index] = 1;
2157  totals[index] = 1;
2158  lastId[index] = threadInfo[0][index];
2159  }
2160 
2161  //
2162  // Run through the rest of the OS procs.
2163  //
2164  for (i = 1; i < num_avail; i++) {
2165  //
2166  // Find the most significant index whose id differs
2167  // from the id for the previous OS proc.
2168  //
2169  for (index = maxIndex; index >= threadIdIndex; index--) {
2170  if (assign_thread_ids && (index == threadIdIndex)) {
2171  //
2172  // Auto-assign the thread id field if it wasn't specified.
2173  //
2174  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2175  threadInfo[i][threadIdIndex] = threadIdCt++;
2176  }
2177 
2178  //
2179  // Apparently the thread id field was specified for some
2180  // entries and not others. Start the thread id counter
2181  // off at the next higher thread id.
2182  //
2183  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2184  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2185  }
2186  }
2187  if (threadInfo[i][index] != lastId[index]) {
2188  //
2189  // Run through all indices which are less significant,
2190  // and reset the counts to 1.
2191  //
2192  // At all levels up to and including index, we need to
2193  // increment the totals and record the last id.
2194  //
2195  unsigned index2;
2196  for (index2 = threadIdIndex; index2 < index; index2++) {
2197  totals[index2]++;
2198  if (counts[index2] > maxCt[index2]) {
2199  maxCt[index2] = counts[index2];
2200  }
2201  counts[index2] = 1;
2202  lastId[index2] = threadInfo[i][index2];
2203  }
2204  counts[index]++;
2205  totals[index]++;
2206  lastId[index] = threadInfo[i][index];
2207 
2208  if (assign_thread_ids && (index > threadIdIndex)) {
2209 
2210 # if KMP_MIC && REDUCE_TEAM_SIZE
2211  //
2212  // The default team size is the total #threads in the machine
2213  // minus 1 thread for every core that has 3 or more threads.
2214  //
2215  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2216 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2217 
2218  //
2219  // Restart the thread counter, as we are on a new core.
2220  //
2221  threadIdCt = 0;
2222 
2223  //
2224  // Auto-assign the thread id field if it wasn't specified.
2225  //
2226  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2227  threadInfo[i][threadIdIndex] = threadIdCt++;
2228  }
2229 
2230  //
2231  // Apparently the thread id field was specified for some
2232  // entries and not others. Start the thread id counter
2233  // off at the next higher thread id.
2234  //
2235  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2236  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2237  }
2238  }
2239  break;
2240  }
2241  }
2242  if (index < threadIdIndex) {
2243  //
2244  // If thread ids were specified, it is an error if they are not
2245  // unique. Also, check that we haven't already restarted the
2246  // loop (to be safe - shouldn't need to).
2247  //
2248  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2249  || assign_thread_ids) {
2250  __kmp_free(lastId);
2251  __kmp_free(totals);
2252  __kmp_free(maxCt);
2253  __kmp_free(counts);
2254  CLEANUP_THREAD_INFO;
2255  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2256  return -1;
2257  }
2258 
2259  //
2260  // If the thread ids were not specified and we see entries
2261  // that are duplicates, start the loop over and
2262  // assign the thread ids manually.
2263  //
2264  assign_thread_ids = true;
2265  goto restart_radix_check;
2266  }
2267  }
2268 
2269 # if KMP_MIC && REDUCE_TEAM_SIZE
2270  //
2271  // The default team size is the total #threads in the machine
2272  // minus 1 thread for every core that has 3 or more threads.
2273  //
2274  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2275 # endif // KMP_MIC && REDUCE_TEAM_SIZE
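    //
    // For example, on a hypothetical 60-core coprocessor with 4 hardware
    // threads per core, each core contributes (4 - 1) threads, so the
    // default team size accumulated above would be 60 * 3 = 180.
    //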
2276 
2277  for (index = threadIdIndex; index <= maxIndex; index++) {
2278  if (counts[index] > maxCt[index]) {
2279  maxCt[index] = counts[index];
2280  }
2281  }
2282 
2283  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2284  nCoresPerPkg = maxCt[coreIdIndex];
2285  nPackages = totals[pkgIdIndex];
2286 
2287  //
2288  // Check to see if the machine topology is uniform
2289  //
2290  unsigned prod = totals[maxIndex];
2291  for (index = threadIdIndex; index < maxIndex; index++) {
2292  prod *= maxCt[index];
2293  }
2294  bool uniform = (prod == totals[threadIdIndex]);
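    //
    // For example, a hypothetical machine with 2 packages, where one package
    // has 8 cores and the other only 4, is non-uniform: maxCt records 8
    // cores per package, so prod exceeds the actual number of threads.
    //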
2295 
2296  //
2297  // When affinity is off, this routine will still be called to set
2298  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2299  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2300  // correctly, and return now if affinity is not enabled.
2301  //
2302  __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2303  __kmp_ncores = totals[coreIdIndex];
2304 
2305  if (__kmp_affinity_verbose) {
2306  if (! KMP_AFFINITY_CAPABLE()) {
2307  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2308  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2309  if (uniform) {
2310  KMP_INFORM(Uniform, "KMP_AFFINITY");
2311  } else {
2312  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2313  }
2314  }
2315  else {
2316  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2317  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2318  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2319  if (__kmp_affinity_respect_mask) {
2320  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2321  } else {
2322  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2323  }
2324  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2325  if (uniform) {
2326  KMP_INFORM(Uniform, "KMP_AFFINITY");
2327  } else {
2328  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2329  }
2330  }
2331  kmp_str_buf_t buf;
2332  __kmp_str_buf_init(&buf);
2333 
2334  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2335  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2336  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2337  }
2338  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2339  maxCt[threadIdIndex], __kmp_ncores);
2340 
2341  __kmp_str_buf_free(&buf);
2342  }
2343 
2344 # if KMP_MIC && REDUCE_TEAM_SIZE
2345  //
2346  // Set the default team size.
2347  //
2348  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2349  __kmp_dflt_team_nth = teamSize;
2350  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2351  __kmp_dflt_team_nth));
2352  }
2353 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2354 
2355  if (__kmp_affinity_type == affinity_none) {
2356  __kmp_free(lastId);
2357  __kmp_free(totals);
2358  __kmp_free(maxCt);
2359  __kmp_free(counts);
2360  CLEANUP_THREAD_INFO;
2361  return 0;
2362  }
2363 
2364  //
2365  // Count the number of levels which have more nodes at that level than
2366  // at the parent's level (with an implicit root node above the
2367  // top level). This is equivalent to saying that there is at least
2368  // one node at this level which has a sibling. These levels are in the
2369  // map, and the package level is always in the map.
2370  //
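    //
    // For example, a hypothetical single package with 4 cores and 2 threads
    // per core (and no node_<n> fields) puts the thread and core levels in
    // the map, the package level is forced in, and the depth below is 3.
    //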
2371  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2372  int level = 0;
2373  for (index = threadIdIndex; index < maxIndex; index++) {
2374  KMP_ASSERT(totals[index] >= totals[index + 1]);
2375  inMap[index] = (totals[index] > totals[index + 1]);
2376  }
2377  inMap[maxIndex] = (totals[maxIndex] > 1);
2378  inMap[pkgIdIndex] = true;
2379 
2380  int depth = 0;
2381  for (index = threadIdIndex; index <= maxIndex; index++) {
2382  if (inMap[index]) {
2383  depth++;
2384  }
2385  }
2386  KMP_ASSERT(depth > 0);
2387 
2388  //
2389  // Construct the data structure that is to be returned.
2390  //
2391  *address2os = (AddrUnsPair*)
2392  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2393  int pkgLevel = -1;
2394  int coreLevel = -1;
2395  int threadLevel = -1;
2396 
2397  for (i = 0; i < num_avail; ++i) {
2398  Address addr(depth);
2399  unsigned os = threadInfo[i][osIdIndex];
2400  int src_index;
2401  int dst_index = 0;
2402 
2403  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2404  if (! inMap[src_index]) {
2405  continue;
2406  }
2407  addr.labels[dst_index] = threadInfo[i][src_index];
2408  if (src_index == pkgIdIndex) {
2409  pkgLevel = dst_index;
2410  }
2411  else if (src_index == coreIdIndex) {
2412  coreLevel = dst_index;
2413  }
2414  else if (src_index == threadIdIndex) {
2415  threadLevel = dst_index;
2416  }
2417  dst_index++;
2418  }
2419  (*address2os)[i] = AddrUnsPair(addr, os);
2420  }
2421 
2422  if (__kmp_affinity_gran_levels < 0) {
2423  //
2424  // Set the granularity level based on what levels are modeled
2425  // in the machine topology map.
2426  //
2427  unsigned src_index;
2428  __kmp_affinity_gran_levels = 0;
2429  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2430  if (! inMap[src_index]) {
2431  continue;
2432  }
2433  switch (src_index) {
2434  case threadIdIndex:
2435  if (__kmp_affinity_gran > affinity_gran_thread) {
2436  __kmp_affinity_gran_levels++;
2437  }
2438 
2439  break;
2440  case coreIdIndex:
2441  if (__kmp_affinity_gran > affinity_gran_core) {
2442  __kmp_affinity_gran_levels++;
2443  }
2444  break;
2445 
2446  case pkgIdIndex:
2447  if (__kmp_affinity_gran > affinity_gran_package) {
2448  __kmp_affinity_gran_levels++;
2449  }
2450  break;
2451  }
2452  }
2453  }
2454 
2455  if (__kmp_affinity_verbose) {
2456  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2457  coreLevel, threadLevel);
2458  }
2459 
2460  __kmp_free(inMap);
2461  __kmp_free(lastId);
2462  __kmp_free(totals);
2463  __kmp_free(maxCt);
2464  __kmp_free(counts);
2465  CLEANUP_THREAD_INFO;
2466  return depth;
2467 }
2468 
2469 
2470 //
2471 // Create and return a table of affinity masks, indexed by OS thread ID.
2472 // This routine handles OR'ing together all the affinity masks of threads
2473 // that are sufficiently close, if granularity > fine.
2474 //
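// For example, with granularity=core on a hypothetical machine with 2
// hardware threads per core, OS procs 0 and 1 (the two threads of core 0)
// are "sufficiently close", so both of their entries in the returned table
// hold the same mask {0,1}.
//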
2475 static kmp_affin_mask_t *
2476 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2477  AddrUnsPair *address2os, unsigned numAddrs)
2478 {
2479  //
2480  // First form a table of affinity masks in order of OS thread id.
2481  //
2482  unsigned depth;
2483  unsigned maxOsId;
2484  unsigned i;
2485 
2486  KMP_ASSERT(numAddrs > 0);
2487  depth = address2os[0].first.depth;
2488 
2489  maxOsId = 0;
2490  for (i = 0; i < numAddrs; i++) {
2491  unsigned osId = address2os[i].second;
2492  if (osId > maxOsId) {
2493  maxOsId = osId;
2494  }
2495  }
2496  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2497  (maxOsId + 1) * __kmp_affin_mask_size);
2498 
2499  //
2500  // Sort the address2os table according to physical order. Doing so
2501  // will put all threads on the same core/package/node in consecutive
2502  // locations.
2503  //
2504  qsort(address2os, numAddrs, sizeof(*address2os),
2505  __kmp_affinity_cmp_Address_labels);
2506 
2507  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2508  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2509  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2510  }
2511  if (__kmp_affinity_gran_levels >= (int)depth) {
2512  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2513  && (__kmp_affinity_type != affinity_none))) {
2514  KMP_WARNING(AffThreadsMayMigrate);
2515  }
2516  }
2517 
2518  //
2519  // Run through the table, forming the masks for all threads on each
2520  // core. Threads on the same core will have identical "Address"
2521  // objects, not considering the last level, which must be the thread
2522  // id. All threads on a core will appear consecutively.
2523  //
2524  unsigned unique = 0;
2525  unsigned j = 0; // index of 1st thread on core
2526  unsigned leader = 0;
2527  Address *leaderAddr = &(address2os[0].first);
2528  kmp_affin_mask_t *sum
2529  = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2530  KMP_CPU_ZERO(sum);
2531  KMP_CPU_SET(address2os[0].second, sum);
2532  for (i = 1; i < numAddrs; i++) {
2533  //
2534  // If this thread is sufficiently close to the leader (within the
2535  // granularity setting), then set the bit for this os thread in the
2536  // affinity mask for this group, and go on to the next thread.
2537  //
2538  if (leaderAddr->isClose(address2os[i].first,
2539  __kmp_affinity_gran_levels)) {
2540  KMP_CPU_SET(address2os[i].second, sum);
2541  continue;
2542  }
2543 
2544  //
2545  // For every thread in this group, copy the mask to the thread's
2546  // entry in the osId2Mask table. Mark the first address as a
2547  // leader.
2548  //
2549  for (; j < i; j++) {
2550  unsigned osId = address2os[j].second;
2551  KMP_DEBUG_ASSERT(osId <= maxOsId);
2552  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2553  KMP_CPU_COPY(mask, sum);
2554  address2os[j].first.leader = (j == leader);
2555  }
2556  unique++;
2557 
2558  //
2559  // Start a new mask.
2560  //
2561  leader = i;
2562  leaderAddr = &(address2os[i].first);
2563  KMP_CPU_ZERO(sum);
2564  KMP_CPU_SET(address2os[i].second, sum);
2565  }
2566 
2567  //
2568  // For every thread in last group, copy the mask to the thread's
2569  // entry in the osId2Mask table.
2570  //
2571  for (; j < i; j++) {
2572  unsigned osId = address2os[j].second;
2573  KMP_DEBUG_ASSERT(osId <= maxOsId);
2574  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2575  KMP_CPU_COPY(mask, sum);
2576  address2os[j].first.leader = (j == leader);
2577  }
2578  unique++;
2579 
2580  *maxIndex = maxOsId;
2581  *numUnique = unique;
2582  return osId2Mask;
2583 }
2584 
2585 
2586 //
2587 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2588 // as file-static than to try and pass them through the calling sequence of
2589 // the recursive-descent OMP_PLACES parser.
2590 //
2591 static kmp_affin_mask_t *newMasks;
2592 static int numNewMasks;
2593 static int nextNewMask;
2594 
2595 #define ADD_MASK(_mask) \
2596  { \
2597  if (nextNewMask >= numNewMasks) { \
2598  numNewMasks *= 2; \
2599  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2600  numNewMasks * __kmp_affin_mask_size); \
2601  } \
2602  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2603  nextNewMask++; \
2604  }
2605 
2606 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2607  { \
2608  if (((_osId) > _maxOsId) || \
2609  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\
2610  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2611  && (__kmp_affinity_type != affinity_none))) { \
2612  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2613  } \
2614  } \
2615  else { \
2616  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2617  } \
2618  }
2619 
2620 
2621 //
2622 // Re-parse the proclist (for the explicit affinity type), and form the list
2623 // of affinity newMasks indexed by gtid.
2624 //
2625 static void
2626 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2627  unsigned int *out_numMasks, const char *proclist,
2628  kmp_affin_mask_t *osId2Mask, int maxOsId)
2629 {
2630  const char *scan = proclist;
2631  const char *next = proclist;
2632 
2633  //
2634  // We use malloc() for the temporary mask vector,
2635  // so that we can use realloc() to extend it.
2636  //
2637  numNewMasks = 2;
2638  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2639  * __kmp_affin_mask_size);
2640  nextNewMask = 0;
2641  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2642  __kmp_affin_mask_size);
2643  int setSize = 0;
2644 
2645  for (;;) {
2646  int start, end, stride;
2647 
2648  SKIP_WS(scan);
2649  next = scan;
2650  if (*next == '\0') {
2651  break;
2652  }
2653 
2654  if (*next == '{') {
2655  int num;
2656  setSize = 0;
2657  next++; // skip '{'
2658  SKIP_WS(next);
2659  scan = next;
2660 
2661  //
2662  // Read the first integer in the set.
2663  //
2664  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2665  "bad explicit proc list");
2666  SKIP_DIGITS(next);
2667  num = __kmp_str_to_int(scan, *next);
2668  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2669 
2670  //
2671  // Copy the mask for that osId to the sum (union) mask.
2672  //
2673  if ((num > maxOsId) ||
2674  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2675  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2676  && (__kmp_affinity_type != affinity_none))) {
2677  KMP_WARNING(AffIgnoreInvalidProcID, num);
2678  }
2679  KMP_CPU_ZERO(sumMask);
2680  }
2681  else {
2682  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2683  setSize = 1;
2684  }
2685 
2686  for (;;) {
2687  //
2688  // Check for end of set.
2689  //
2690  SKIP_WS(next);
2691  if (*next == '}') {
2692  next++; // skip '}'
2693  break;
2694  }
2695 
2696  //
2697  // Skip optional comma.
2698  //
2699  if (*next == ',') {
2700  next++;
2701  }
2702  SKIP_WS(next);
2703 
2704  //
2705  // Read the next integer in the set.
2706  //
2707  scan = next;
2708  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2709  "bad explicit proc list");
2710 
2711  SKIP_DIGITS(next);
2712  num = __kmp_str_to_int(scan, *next);
2713  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2714 
2715  //
2716  // Add the mask for that osId to the sum mask.
2717  //
2718  if ((num > maxOsId) ||
2719  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2720  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2721  && (__kmp_affinity_type != affinity_none))) {
2722  KMP_WARNING(AffIgnoreInvalidProcID, num);
2723  }
2724  }
2725  else {
2726  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2727  setSize++;
2728  }
2729  }
2730  if (setSize > 0) {
2731  ADD_MASK(sumMask);
2732  }
2733 
2734  SKIP_WS(next);
2735  if (*next == ',') {
2736  next++;
2737  }
2738  scan = next;
2739  continue;
2740  }
2741 
2742  //
2743  // Read the first integer.
2744  //
2745  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2746  SKIP_DIGITS(next);
2747  start = __kmp_str_to_int(scan, *next);
2748  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2749  SKIP_WS(next);
2750 
2751  //
2752  // If this isn't a range, then add a mask to the list and go on.
2753  //
2754  if (*next != '-') {
2755  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2756 
2757  //
2758  // Skip optional comma.
2759  //
2760  if (*next == ',') {
2761  next++;
2762  }
2763  scan = next;
2764  continue;
2765  }
2766 
2767  //
2768  // This is a range. Skip over the '-' and read in the 2nd int.
2769  //
2770  next++; // skip '-'
2771  SKIP_WS(next);
2772  scan = next;
2773  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2774  SKIP_DIGITS(next);
2775  end = __kmp_str_to_int(scan, *next);
2776  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2777 
2778  //
2779  // Check for a stride parameter
2780  //
2781  stride = 1;
2782  SKIP_WS(next);
2783  if (*next == ':') {
2784  //
2785  // A stride is specified. Skip over the ':' and read the 3rd int.
2786  //
2787  int sign = +1;
2788  next++; // skip ':'
2789  SKIP_WS(next);
2790  scan = next;
2791  if (*next == '-') {
2792  sign = -1;
2793  next++;
2794  SKIP_WS(next);
2795  scan = next;
2796  }
2797  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2798  "bad explicit proc list");
2799  SKIP_DIGITS(next);
2800  stride = __kmp_str_to_int(scan, *next);
2801  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2802  stride *= sign;
2803  }
2804 
2805  //
2806  // Do some range checks.
2807  //
2808  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2809  if (stride > 0) {
2810  KMP_ASSERT2(start <= end, "bad explicit proc list");
2811  }
2812  else {
2813  KMP_ASSERT2(start >= end, "bad explicit proc list");
2814  }
2815  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2816 
2817  //
2818  // Add the mask for each OS proc # to the list.
2819  //
2820  if (stride > 0) {
2821  do {
2822  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2823  start += stride;
2824  } while (start <= end);
2825  }
2826  else {
2827  do {
2828  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2829  start += stride;
2830  } while (start >= end);
2831  }
2832 
2833  //
2834  // Skip optional comma.
2835  //
2836  SKIP_WS(next);
2837  if (*next == ',') {
2838  next++;
2839  }
2840  scan = next;
2841  }
2842 
2843  *out_numMasks = nextNewMask;
2844  if (nextNewMask == 0) {
2845  *out_masks = NULL;
2846  KMP_INTERNAL_FREE(newMasks);
2847  return;
2848  }
2849  *out_masks
2850  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2851  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2852  __kmp_free(sumMask);
2853  KMP_INTERNAL_FREE(newMasks);
2854 }
2855 
2856 
2857 # if OMP_40_ENABLED
2858 
2859 /*-----------------------------------------------------------------------------
2860 
2861 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2862 places. Again, here is the grammar:
2863 
2864 place_list := place
2865 place_list := place , place_list
2866 place := num
2867 place := place : num
2868 place := place : num : signed
2869 place := { subplace_list }
2870 place := ! place // (lowest priority)
2871 subplace_list := subplace
2872 subplace_list := subplace , subplace_list
2873 subplace := num
2874 subplace := num : num
2875 subplace := num : num : signed
2876 signed := num
2877 signed := + signed
2878 signed := - signed
2879 
2880 -----------------------------------------------------------------------------*/
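//
// For illustration, two hypothetical place lists accepted by this grammar:
//
//     {0,1,2,3},{4,5,6,7}    two explicit places of 4 proc ids each
//     {0:4}:8:4              8 places total: {0,1,2,3} and 7 more copies,
//                            each shifted by 4 proc ids
//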
2881 
2882 static void
2883 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2884  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2885 {
2886  const char *next;
2887 
2888  for (;;) {
2889  int start, count, stride, i;
2890 
2891  //
2892  // Read in the starting proc id
2893  //
2894  SKIP_WS(*scan);
2895  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2896  "bad explicit places list");
2897  next = *scan;
2898  SKIP_DIGITS(next);
2899  start = __kmp_str_to_int(*scan, *next);
2900  KMP_ASSERT(start >= 0);
2901  *scan = next;
2902 
2903  //
2904  // valid follow sets are ',' ':' and '}'
2905  //
2906  SKIP_WS(*scan);
2907  if (**scan == '}' || **scan == ',') {
2908  if ((start > maxOsId) ||
2909  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2910  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2911  && (__kmp_affinity_type != affinity_none))) {
2912  KMP_WARNING(AffIgnoreInvalidProcID, start);
2913  }
2914  }
2915  else {
2916  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2917  (*setSize)++;
2918  }
2919  if (**scan == '}') {
2920  break;
2921  }
2922  (*scan)++; // skip ','
2923  continue;
2924  }
2925  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2926  (*scan)++; // skip ':'
2927 
2928  //
2929  // Read count parameter
2930  //
2931  SKIP_WS(*scan);
2932  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2933  "bad explicit places list");
2934  next = *scan;
2935  SKIP_DIGITS(next);
2936  count = __kmp_str_to_int(*scan, *next);
2937  KMP_ASSERT(count >= 0);
2938  *scan = next;
2939 
2940  //
2941  // valid follow sets are ',' ':' and '}'
2942  //
2943  SKIP_WS(*scan);
2944  if (**scan == '}' || **scan == ',') {
2945  for (i = 0; i < count; i++) {
2946  if ((start > maxOsId) ||
2947  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2948  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2949  && (__kmp_affinity_type != affinity_none))) {
2950  KMP_WARNING(AffIgnoreInvalidProcID, start);
2951  }
2952  break; // don't proliferate warnings for large count
2953  }
2954  else {
2955  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2956  start++;
2957  (*setSize)++;
2958  }
2959  }
2960  if (**scan == '}') {
2961  break;
2962  }
2963  (*scan)++; // skip ','
2964  continue;
2965  }
2966  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2967  (*scan)++; // skip ':'
2968 
2969  //
2970  // Read stride parameter
2971  //
2972  int sign = +1;
2973  for (;;) {
2974  SKIP_WS(*scan);
2975  if (**scan == '+') {
2976  (*scan)++; // skip '+'
2977  continue;
2978  }
2979  if (**scan == '-') {
2980  sign *= -1;
2981  (*scan)++; // skip '-'
2982  continue;
2983  }
2984  break;
2985  }
2986  SKIP_WS(*scan);
2987  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2988  "bad explicit places list");
2989  next = *scan;
2990  SKIP_DIGITS(next);
2991  stride = __kmp_str_to_int(*scan, *next);
2992  KMP_ASSERT(stride >= 0);
2993  *scan = next;
2994  stride *= sign;
2995 
2996  //
2997  // valid follow sets are ',' and '}'
2998  //
2999  SKIP_WS(*scan);
3000  if (**scan == '}' || **scan == ',') {
3001  for (i = 0; i < count; i++) {
3002  if ((start > maxOsId) ||
3003  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3004  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3005  && (__kmp_affinity_type != affinity_none))) {
3006  KMP_WARNING(AffIgnoreInvalidProcID, start);
3007  }
3008  break; // don't proliferate warnings for large count
3009  }
3010  else {
3011  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3012  start += stride;
3013  (*setSize)++;
3014  }
3015  }
3016  if (**scan == '}') {
3017  break;
3018  }
3019  (*scan)++; // skip ','
3020  continue;
3021  }
3022 
3023  KMP_ASSERT2(0, "bad explicit places list");
3024  }
3025 }
3026 
3027 
3028 static void
3029 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3030  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3031 {
3032  const char *next;
3033 
3034  //
3035  // valid follow sets are '{' '!' and num
3036  //
3037  SKIP_WS(*scan);
3038  if (**scan == '{') {
3039  (*scan)++; // skip '{'
3040  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3041  setSize);
3042  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3043  (*scan)++; // skip '}'
3044  }
3045  else if (**scan == '!') {
3046  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3047  KMP_CPU_COMPLEMENT(tempMask);
3048  (*scan)++; // skip '!'
3049  }
3050  else if ((**scan >= '0') && (**scan <= '9')) {
3051  next = *scan;
3052  SKIP_DIGITS(next);
3053  int num = __kmp_str_to_int(*scan, *next);
3054  KMP_ASSERT(num >= 0);
3055  if ((num > maxOsId) ||
3056  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3057  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3058  && (__kmp_affinity_type != affinity_none))) {
3059  KMP_WARNING(AffIgnoreInvalidProcID, num);
3060  }
3061  }
3062  else {
3063  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3064  (*setSize)++;
3065  }
3066  *scan = next; // skip num
3067  }
3068  else {
3069  KMP_ASSERT2(0, "bad explicit places list");
3070  }
3071 }
3072 
3073 
3074 static void
3075 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3076  unsigned int *out_numMasks, const char *placelist,
3077  kmp_affin_mask_t *osId2Mask, int maxOsId)
3078 {
3079  const char *scan = placelist;
3080  const char *next = placelist;
3081 
3082  numNewMasks = 2;
3083  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3084  * __kmp_affin_mask_size);
3085  nextNewMask = 0;
3086 
3087  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3088  __kmp_affin_mask_size);
3089  KMP_CPU_ZERO(tempMask);
3090  int setSize = 0;
3091 
3092  for (;;) {
3093  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3094 
3095  //
3096  // valid follow sets are ',' ':' and EOL
3097  //
3098  SKIP_WS(scan);
3099  if (*scan == '\0' || *scan == ',') {
3100  if (setSize > 0) {
3101  ADD_MASK(tempMask);
3102  }
3103  KMP_CPU_ZERO(tempMask);
3104  setSize = 0;
3105  if (*scan == '\0') {
3106  break;
3107  }
3108  scan++; // skip ','
3109  continue;
3110  }
3111 
3112  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3113  scan++; // skip ':'
3114 
3115  //
3116  // Read count parameter
3117  //
3118  SKIP_WS(scan);
3119  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3120  "bad explicit places list");
3121  next = scan;
3122  SKIP_DIGITS(next);
3123  int count = __kmp_str_to_int(scan, *next);
3124  KMP_ASSERT(count >= 0);
3125  scan = next;
3126 
3127  //
3128  // valid follow sets are ',' ':' and EOL
3129  //
3130  SKIP_WS(scan);
3131  if (*scan == '\0' || *scan == ',') {
3132  int i;
3133  for (i = 0; i < count; i++) {
3134  int j;
3135  if (setSize == 0) {
3136  break;
3137  }
3138  ADD_MASK(tempMask);
3139  setSize = 0;
3140  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) {
3141  //
3142  // Use a temp var in case macro is changed to evaluate
3143  // args multiple times.
3144  //
3145  if (KMP_CPU_ISSET(j - 1, tempMask)) {
3146  KMP_CPU_SET(j, tempMask);
3147  setSize++;
3148  }
3149  else {
3150  KMP_CPU_CLR(j, tempMask);
3151  }
3152  }
3153  for (; j >= 0; j--) {
3154  KMP_CPU_CLR(j, tempMask);
3155  }
3156  }
3157  KMP_CPU_ZERO(tempMask);
3158  setSize = 0;
3159 
3160  if (*scan == '\0') {
3161  break;
3162  }
3163  scan++; // skip ','
3164  continue;
3165  }
3166 
3167  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3168  scan++; // skip ':'
3169 
3170  //
3171  // Read stride parameter
3172  //
3173  int sign = +1;
3174  for (;;) {
3175  SKIP_WS(scan);
3176  if (*scan == '+') {
3177  scan++; // skip '+'
3178  continue;
3179  }
3180  if (*scan == '-') {
3181  sign *= -1;
3182  scan++; // skip '-'
3183  continue;
3184  }
3185  break;
3186  }
3187  SKIP_WS(scan);
3188  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3189  "bad explicit places list");
3190  next = scan;
3191  SKIP_DIGITS(next);
3192  int stride = __kmp_str_to_int(scan, *next);
3193  KMP_DEBUG_ASSERT(stride >= 0);
3194  scan = next;
3195  stride *= sign;
3196 
3197  if (stride > 0) {
3198  int i;
3199  for (i = 0; i < count; i++) {
3200  int j;
3201  if (setSize == 0) {
3202  break;
3203  }
3204  ADD_MASK(tempMask);
3205  setSize = 0;
3206  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3207  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3208  KMP_CPU_SET(j, tempMask);
3209  setSize++;
3210  }
3211  else {
3212  KMP_CPU_CLR(j, tempMask);
3213  }
3214  }
3215  for (; j >= 0; j--) {
3216  KMP_CPU_CLR(j, tempMask);
3217  }
3218  }
3219  }
3220  else {
3221  int i;
3222  for (i = 0; i < count; i++) {
3223  unsigned j;
3224  if (setSize == 0) {
3225  break;
3226  }
3227  ADD_MASK(tempMask);
3228  setSize = 0;
3229  for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride;
3230  j++) {
3231  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3232  KMP_CPU_SET(j, tempMask);
3233  setSize++;
3234  }
3235  else {
3236  KMP_CPU_CLR(j, tempMask);
3237  }
3238  }
3239  for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) {
3240  KMP_CPU_CLR(j, tempMask);
3241  }
3242  }
3243  }
3244  KMP_CPU_ZERO(tempMask);
3245  setSize = 0;
3246 
3247  //
3248  // valid follow sets are ',' and EOL
3249  //
3250  SKIP_WS(scan);
3251  if (*scan == '\0') {
3252  break;
3253  }
3254  if (*scan == ',') {
3255  scan++; // skip ','
3256  continue;
3257  }
3258 
3259  KMP_ASSERT2(0, "bad explicit places list");
3260  }
3261 
3262  *out_numMasks = nextNewMask;
3263  if (nextNewMask == 0) {
3264  *out_masks = NULL;
3265  KMP_INTERNAL_FREE(newMasks);
3266  return;
3267  }
3268  *out_masks
3269  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3270  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3271  __kmp_free(tempMask);
3272  KMP_INTERNAL_FREE(newMasks);
3273 }
3274 
3275 # endif /* OMP_40_ENABLED */
3276 
3277 #undef ADD_MASK
3278 #undef ADD_MASK_OSID
3279 
3280 
3281 # if KMP_MIC
3282 
3283 static void
3284 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3285 {
3286  if ( __kmp_place_num_cores == 0 ) {
3287  if ( __kmp_place_num_threads_per_core == 0 ) {
3288  return; // no core-limiting actions requested, exit
3289  }
3290  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3291  }
3292  if ( !__kmp_affinity_uniform_topology() || depth != 3 ) {
3293  KMP_WARNING( AffThrPlaceUnsupported );
3294  return; // don't support non-uniform topology or non-3-level architecture
3295  }
3296  if ( __kmp_place_num_threads_per_core == 0 ) {
3297  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3298  }
3299  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3300  KMP_WARNING( AffThrPlaceManyCores );
3301  return;
3302  }
3303 
3304  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3305  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3306  int i, j, k, n_old = 0, n_new = 0;
3307  for ( i = 0; i < nPackages; ++i ) {
3308  for ( j = 0; j < nCoresPerPkg; ++j ) {
3309  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3310  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3311  } else {
3312  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3313  if ( k < __kmp_place_num_threads_per_core ) {
3314  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location
3315  n_new++;
3316  }
3317  n_old++;
3318  }
3319  }
3320  }
3321  }
3322  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3323  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3324  __kmp_avail_proc = n_new; // correct avail_proc
3325  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3326 
3327  __kmp_free( *pAddr );
3328  *pAddr = newAddr; // replace old topology with new one
3329 }
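//
// For example, given a hypothetical uniform topology of 1 package x 61 cores
// x 4 threads and a request for 60 cores with 2 threads per core (offset 0),
// the filtered topology above keeps 60 * 2 = 120 entries, and __kmp_ncores
// becomes 60.
//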
3330 
3331 # endif /* KMP_MIC */
3332 
3333 
3334 static AddrUnsPair *address2os = NULL;
3335 static int * procarr = NULL;
3336 static int __kmp_aff_depth = 0;
3337 
3338 static void
3339 __kmp_aux_affinity_initialize(void)
3340 {
3341  if (__kmp_affinity_masks != NULL) {
3342  KMP_ASSERT(fullMask != NULL);
3343  return;
3344  }
3345 
3346  //
3347  // Create the "full" mask - this defines all of the processors that we
3348  // consider to be in the machine model. If respect is set, then it is
3349  // the initialization thread's affinity mask. Otherwise, it is all
3350  // processors that we know about on the machine.
3351  //
3352  if (fullMask == NULL) {
3353  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3354  }
3355  if (KMP_AFFINITY_CAPABLE()) {
3356  if (__kmp_affinity_respect_mask) {
3357  __kmp_get_system_affinity(fullMask, TRUE);
3358 
3359  //
3360  // Count the number of available processors.
3361  //
3362  unsigned i;
3363  __kmp_avail_proc = 0;
3364  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3365  if (! KMP_CPU_ISSET(i, fullMask)) {
3366  continue;
3367  }
3368  __kmp_avail_proc++;
3369  }
3370  if (__kmp_avail_proc > __kmp_xproc) {
3371  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3372  && (__kmp_affinity_type != affinity_none))) {
3373  KMP_WARNING(ErrorInitializeAffinity);
3374  }
3375  __kmp_affinity_type = affinity_none;
3376  __kmp_affin_mask_size = 0;
3377  return;
3378  }
3379  }
3380  else {
3381  __kmp_affinity_entire_machine_mask(fullMask);
3382  __kmp_avail_proc = __kmp_xproc;
3383  }
3384  }
3385 
3386  int depth = -1;
3387  kmp_i18n_id_t msg_id = kmp_i18n_null;
3388 
3389  //
3390  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3391  // KMP_TOPOLOGY_METHOD=cpuinfo
3392  //
3393  if ((__kmp_cpuinfo_file != NULL) &&
3394  (__kmp_affinity_top_method == affinity_top_method_all)) {
3395  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3396  }
3397 
3398  if (__kmp_affinity_top_method == affinity_top_method_all) {
3399  //
3400  // In the default code path, errors are not fatal - we just try using
3401  // another method. We only emit a warning message if affinity is on,
3402  // or the verbose flag is set, and the nowarnings flag was not set.
3403  //
3404  const char *file_name = NULL;
3405  int line = 0;
3406 
3407 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3408 
3409  if (__kmp_affinity_verbose) {
3410  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3411  }
3412 
3413  file_name = NULL;
3414  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3415  if (depth == 0) {
3416  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3417  KMP_ASSERT(address2os == NULL);
3418  return;
3419  }
3420 
3421  if (depth < 0) {
3422  if ((msg_id != kmp_i18n_null)
3423  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3424  && (__kmp_affinity_type != affinity_none)))) {
3425 # if KMP_MIC
3426  if (__kmp_affinity_verbose) {
3427  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3428  KMP_I18N_STR(DecodingLegacyAPIC));
3429  }
3430 # else
3431  KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3432  KMP_I18N_STR(DecodingLegacyAPIC));
3433 # endif
3434  }
3435 
3436  file_name = NULL;
3437  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3438  if (depth == 0) {
3439  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3440  KMP_ASSERT(address2os == NULL);
3441  return;
3442  }
3443  }
3444 
3445 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3446 
3447 # if KMP_OS_LINUX
3448 
3449  if (depth < 0) {
3450  if ((msg_id != kmp_i18n_null)
3451  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3452  && (__kmp_affinity_type != affinity_none)))) {
3453 # if KMP_MIC
3454  if (__kmp_affinity_verbose) {
3455  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3456  }
3457 # else
3458  KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3459 # endif
3460  }
3461  else if (__kmp_affinity_verbose) {
3462  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3463  }
3464 
3465  FILE *f = fopen("/proc/cpuinfo", "r");
3466  if (f == NULL) {
3467  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3468  }
3469  else {
3470  file_name = "/proc/cpuinfo";
3471  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3472  fclose(f);
3473  if (depth == 0) {
3474  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3475  KMP_ASSERT(address2os == NULL);
3476  return;
3477  }
3478  }
3479  }
3480 
3481 # endif /* KMP_OS_LINUX */
3482 
3483  if (depth < 0) {
3484  if (msg_id != kmp_i18n_null
3485  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3486  && (__kmp_affinity_type != affinity_none)))) {
3487  if (file_name == NULL) {
3488  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3489  }
3490  else if (line == 0) {
3491  KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3492  }
3493  else {
3494  KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3495  }
3496  }
3497 
3498  file_name = "";
3499  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3500  if (depth == 0) {
3501  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3502  KMP_ASSERT(address2os == NULL);
3503  return;
3504  }
3505  KMP_ASSERT(depth > 0);
3506  KMP_ASSERT(address2os != NULL);
3507  }
3508  }
3509 
3510  //
3511  // If the user has specified that a particular topology discovery method
3512  // is to be used, then we abort if that method fails. The exception is
3513  // group affinity, which might have been implicitly set.
3514  //
3515 
3516 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3517 
3518  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3519  if (__kmp_affinity_verbose) {
3520  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3521  KMP_I18N_STR(Decodingx2APIC));
3522  }
3523 
3524  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3525  if (depth == 0) {
3526  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3527  KMP_ASSERT(address2os == NULL);
3528  return;
3529  }
3530 
3531  if (depth < 0) {
3532  KMP_ASSERT(msg_id != kmp_i18n_null);
3533  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3534  }
3535  }
3536  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3537  if (__kmp_affinity_verbose) {
3538  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3539  KMP_I18N_STR(DecodingLegacyAPIC));
3540  }
3541 
3542  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3543  if (depth == 0) {
3544  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3545  KMP_ASSERT(address2os == NULL);
3546  return;
3547  }
3548 
3549  if (depth < 0) {
3550  KMP_ASSERT(msg_id != kmp_i18n_null);
3551  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3552  }
3553  }
3554 
3555 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3556 
3557  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3558  const char *filename;
3559  if (__kmp_cpuinfo_file != NULL) {
3560  filename = __kmp_cpuinfo_file;
3561  }
3562  else {
3563  filename = "/proc/cpuinfo";
3564  }
3565 
3566  if (__kmp_affinity_verbose) {
3567  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3568  }
3569 
3570  FILE *f = fopen(filename, "r");
3571  if (f == NULL) {
3572  int code = errno;
3573  if (__kmp_cpuinfo_file != NULL) {
3574  __kmp_msg(
3575  kmp_ms_fatal,
3576  KMP_MSG(CantOpenFileForReading, filename),
3577  KMP_ERR(code),
3578  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3579  __kmp_msg_null
3580  );
3581  }
3582  else {
3583  __kmp_msg(
3584  kmp_ms_fatal,
3585  KMP_MSG(CantOpenFileForReading, filename),
3586  KMP_ERR(code),
3587  __kmp_msg_null
3588  );
3589  }
3590  }
3591  int line = 0;
3592  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3593  fclose(f);
3594  if (depth < 0) {
3595  KMP_ASSERT(msg_id != kmp_i18n_null);
3596  if (line > 0) {
3597  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3598  }
3599  else {
3600  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3601  }
3602  }
3603  if (__kmp_affinity_type == affinity_none) {
3604  KMP_ASSERT(depth == 0);
3605  KMP_ASSERT(address2os == NULL);
3606  return;
3607  }
3608  }
3609 
3610 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3611 
3612  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3613  if (__kmp_affinity_verbose) {
3614  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3615  }
3616 
3617  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3618  KMP_ASSERT(depth != 0);
3619 
3620  if (depth < 0) {
3621  if ((msg_id != kmp_i18n_null)
3622  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3623  && (__kmp_affinity_type != affinity_none)))) {
3624  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3625  }
3626 
3627  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3628  if (depth == 0) {
3629  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3630  KMP_ASSERT(address2os == NULL);
3631  return;
3632  }
3633  // should not fail
3634  KMP_ASSERT(depth > 0);
3635  KMP_ASSERT(address2os != NULL);
3636  }
3637  }
3638 
3639 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3640 
3641  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3642  if (__kmp_affinity_verbose) {
3643  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3644  }
3645 
3646  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3647  if (depth == 0) {
3648  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3649  KMP_ASSERT(address2os == NULL);
3650  return;
3651  }
3652  // should not fail
3653  KMP_ASSERT(depth > 0);
3654  KMP_ASSERT(address2os != NULL);
3655  }
3656 
3657  if (address2os == NULL) {
3658  if (KMP_AFFINITY_CAPABLE()
3659  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3660  && (__kmp_affinity_type != affinity_none)))) {
3661  KMP_WARNING(ErrorInitializeAffinity);
3662  }
3663  __kmp_affinity_type = affinity_none;
3664  __kmp_affin_mask_size = 0;
3665  return;
3666  }
3667 
3668 # if KMP_MIC
3669  __kmp_apply_thread_places(&address2os, depth);
3670 # endif
3671 
3672  //
3673  // Create the table of masks, indexed by thread Id.
3674  //
3675  unsigned maxIndex;
3676  unsigned numUnique;
3677  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3678  address2os, __kmp_avail_proc);
3679  if (__kmp_affinity_gran_levels == 0) {
3680  KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc);
3681  }
3682 
3683  //
3684  // Set the childNums vector in all Address objects. This must be done
3685  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3686  // which takes into account the setting of __kmp_affinity_compact.
3687  //
3688  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3689 
3690  switch (__kmp_affinity_type) {
3691 
3692  case affinity_explicit:
3693  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3694 # if OMP_40_ENABLED
3695  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3696 # endif
3697  {
3698  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3699  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3700  maxIndex);
3701  }
3702 # if OMP_40_ENABLED
3703  else {
3704  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3705  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3706  maxIndex);
3707  }
3708 # endif
3709  if (__kmp_affinity_num_masks == 0) {
3710  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3711  && (__kmp_affinity_type != affinity_none))) {
3712  KMP_WARNING(AffNoValidProcID);
3713  }
3714  __kmp_affinity_type = affinity_none;
3715  return;
3716  }
3717  break;
3718 
3719  //
3720  // The other affinity types rely on sorting the Addresses according
3721  // to some permutation of the machine topology tree. Set
3722  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3723  // then jump to a common code fragment to do the sort and create
3724  // the array of affinity masks.
3725  //
3726 
3727  case affinity_logical:
3728  __kmp_affinity_compact = 0;
3729  if (__kmp_affinity_offset) {
3730  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3731  % __kmp_avail_proc;
3732  }
3733  goto sortAddresses;
3734 
3735  case affinity_physical:
3736  if (__kmp_nThreadsPerCore > 1) {
3737  __kmp_affinity_compact = 1;
3738  if (__kmp_affinity_compact >= depth) {
3739  __kmp_affinity_compact = 0;
3740  }
3741  } else {
3742  __kmp_affinity_compact = 0;
3743  }
3744  if (__kmp_affinity_offset) {
3745  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3746  % __kmp_avail_proc;
3747  }
3748  goto sortAddresses;
3749 
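 // Scatter reverses the meaning of the compact level so that consecutive
 // threads are distributed across the topology instead of packed together.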
3750  case affinity_scatter:
3751  if (__kmp_affinity_compact >= depth) {
3752  __kmp_affinity_compact = 0;
3753  }
3754  else {
3755  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3756  }
3757  goto sortAddresses;
3758 
3759  case affinity_compact:
3760  if (__kmp_affinity_compact >= depth) {
3761  __kmp_affinity_compact = depth - 1;
3762  }
3763  goto sortAddresses;
3764 
3765 # if KMP_MIC
3766  case affinity_balanced:
3767  // Balanced affinity requires a single package; non-uniform topologies are handled via procarr[] below
3768  if( nPackages > 1 ) {
3769  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3770  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3771  }
3772  __kmp_affinity_type = affinity_none;
3773  return;
3774  } else if( __kmp_affinity_uniform_topology() ) {
3775  break;
3776  } else { // Non-uniform topology
3777 
3778  // Save the depth for further usage
3779  __kmp_aff_depth = depth;
3780 
3781  // Number of hyper threads per core in HT machine
3782  int nth_per_core = __kmp_nThreadsPerCore;
3783 
3784  int core_level;
3785  if( nth_per_core > 1 ) {
3786  core_level = depth - 2;
3787  } else {
3788  core_level = depth - 1;
3789  }
3790  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3791  int nproc = nth_per_core * ncores;
3792 
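 // procarr[] is a flattened (ncores x nth_per_core) table mapping
 // (core, thread context) -> OS proc id; entries left at -1 mark contexts
 // that have no available proc on this non-uniform topology.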
3793  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3794  for( int i = 0; i < nproc; i++ ) {
3795  procarr[ i ] = -1;
3796  }
3797 
3798  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3799  int proc = address2os[ i ].second;
3800  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3801  // If there is only one thread per core then depth == 2: level 0 - package,
3802  // level 1 - core.
3803  int level = depth - 1;
3804 
3805  // Defaults for the case of one thread context per core (__kmp_nth_per_core == 1)
3806  int thread = 0;
3807  int core = address2os[ i ].first.labels[ level ];
3808  // If the thread level exists, i.e. we have more than one thread context per core
3809  if( nth_per_core > 1 ) {
3810  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3811  core = address2os[ i ].first.labels[ level - 1 ];
3812  }
3813  procarr[ core * nth_per_core + thread ] = proc;
3814  }
3815 
3816  break;
3817  }
3818 # endif
3819 
3820  sortAddresses:
3821  //
3822  // Allocate the gtid->affinity mask table.
3823  //
3824  if (__kmp_affinity_dups) {
3825  __kmp_affinity_num_masks = __kmp_avail_proc;
3826  }
3827  else {
3828  __kmp_affinity_num_masks = numUnique;
3829  }
3830 
3831 # if OMP_40_ENABLED
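 // If an OpenMP 4.0 place policy (rather than the legacy 'intel' binding)
 // requested fewer places than there are unique masks, cap the mask count
 // at the requested number of places.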
3832  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3833  && ( __kmp_affinity_num_places > 0 )
3834  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3835  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3836  }
3837 # endif
3838 
3839  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3840  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3841 
3842  //
3843  // Sort the address2os table according to the current setting of
3844  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3845  //
3846  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3847  __kmp_affinity_cmp_Address_child_num);
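 // Walk the sorted table and copy one mask per place. Unless duplicates are
 // requested, only the leader of each group of addresses sharing a mask (at
 // the chosen granularity) contributes an entry.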
3848  {
3849  int i;
3850  unsigned j;
3851  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3852  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3853  continue;
3854  }
3855  unsigned osId = address2os[i].second;
3856  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3857  kmp_affin_mask_t *dest
3858  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3859  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3860  KMP_CPU_COPY(dest, src);
3861  if (++j >= __kmp_affinity_num_masks) {
3862  break;
3863  }
3864  }
3865  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3866  }
3867  break;
3868 
3869  default:
3870  KMP_ASSERT2(0, "Unexpected affinity setting");
3871  }
3872 
3873  __kmp_free(osId2Mask);
3874 }
3875 
3876 
3877 void
3878 __kmp_affinity_initialize(void)
3879 {
3880  //
3881  // Much of the code above was written assuming that if a machine was not
3882  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3883  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3884  //
3885  // There are too many checks for __kmp_affinity_type == affinity_none
3886  // in this code. Instead of trying to change them all, check if
3887  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3888  // affinity_none, call the real initialization routine, then restore
3889  // __kmp_affinity_type to affinity_disabled.
3890  //
3891  int disabled = (__kmp_affinity_type == affinity_disabled);
3892  if (! KMP_AFFINITY_CAPABLE()) {
3893  KMP_ASSERT(disabled);
3894  }
3895  if (disabled) {
3896  __kmp_affinity_type = affinity_none;
3897  }
3898  __kmp_aux_affinity_initialize();
3899  if (disabled) {
3900  __kmp_affinity_type = affinity_disabled;
3901  }
3902 }
3903 
3904 
3905 void
3906 __kmp_affinity_uninitialize(void)
3907 {
3908  if (__kmp_affinity_masks != NULL) {
3909  __kmp_free(__kmp_affinity_masks);
3910  __kmp_affinity_masks = NULL;
3911  }
3912  if (fullMask != NULL) {
3913  KMP_CPU_FREE(fullMask);
3914  fullMask = NULL;
3915  }
3916  __kmp_affinity_num_masks = 0;
3917 # if OMP_40_ENABLED
3918  __kmp_affinity_num_places = 0;
3919 # endif
3920  if (__kmp_affinity_proclist != NULL) {
3921  __kmp_free(__kmp_affinity_proclist);
3922  __kmp_affinity_proclist = NULL;
3923  }
3924  if( address2os != NULL ) {
3925  __kmp_free( address2os );
3926  address2os = NULL;
3927  }
3928  if( procarr != NULL ) {
3929  __kmp_free( procarr );
3930  procarr = NULL;
3931  }
3932 }
3933 
3934 
3935 void
3936 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3937 {
3938  if (! KMP_AFFINITY_CAPABLE()) {
3939  return;
3940  }
3941 
3942  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3943  if (th->th.th_affin_mask == NULL) {
3944  KMP_CPU_ALLOC(th->th.th_affin_mask);
3945  }
3946  else {
3947  KMP_CPU_ZERO(th->th.th_affin_mask);
3948  }
3949 
3950  //
3951  // Copy the thread mask to the kmp_info_t structure.
3952  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3953  // that has all of the OS proc ids set. If __kmp_affinity_respect_mask is
3954  // set, the full mask is instead the mask of the thread that performed
3955  // initialization.
3956  //
3957  kmp_affin_mask_t *mask;
3958  int i;
3959 
3960 # if OMP_40_ENABLED
3961  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3962 # endif
3963  {
3964  if ((__kmp_affinity_type == affinity_none)
3965 # if KMP_MIC
3966  || (__kmp_affinity_type == affinity_balanced)
3967 # endif
3968  ) {
3969 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3970  if (__kmp_num_proc_groups > 1) {
3971  return;
3972  }
3973 # endif
3974  KMP_ASSERT(fullMask != NULL);
3975  i = -1;
3976  mask = fullMask;
3977  }
3978  else {
3979  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3980  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3981  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3982  }
3983  }
3984 # if OMP_40_ENABLED
3985  else {
3986  if ((! isa_root)
3987  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3988 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3989  if (__kmp_num_proc_groups > 1) {
3990  return;
3991  }
3992 # endif
3993  KMP_ASSERT(fullMask != NULL);
3994  i = KMP_PLACE_ALL;
3995  mask = fullMask;
3996  }
3997  else {
3998  //
3999  // i could come from a hash function, or a counter that doesn't
4000  // always start at 0. Use gtid for now.
4001  //
4002  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4003  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4004  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4005  }
4006  }
4007 # endif
4008 
4009 # if OMP_40_ENABLED
4010  th->th.th_current_place = i;
4011  if (isa_root) {
4012  th->th.th_new_place = i;
4013  th->th.th_first_place = 0;
4014  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4015  }
4016 
4017  if (i == KMP_PLACE_ALL) {
4018  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4019  gtid));
4020  }
4021  else {
4022  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4023  gtid, i));
4024  }
4025 # else
4026  if (i == -1) {
4027  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4028  gtid));
4029  }
4030  else {
4031  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4032  gtid, i));
4033  }
4034 # endif /* OMP_40_ENABLED */
4035 
4036  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4037 
4038  if (__kmp_affinity_verbose) {
4039  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4040  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4041  th->th.th_affin_mask);
4042  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
4043  }
4044 
4045 # if KMP_OS_WINDOWS
4046  //
4047  // On Windows* OS, the process affinity mask might have changed.
4048  // If the user didn't request affinity and this call fails,
4049  // just continue silently. See CQ171393.
4050  //
4051  if ( __kmp_affinity_type == affinity_none ) {
4052  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4053  }
4054  else
4055 # endif
4056  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4057 }
4058 
4059 
4060 # if OMP_40_ENABLED
4061 
4062 void
4063 __kmp_affinity_set_place(int gtid)
4064 {
4065  int retval;
4066 
4067  if (! KMP_AFFINITY_CAPABLE()) {
4068  return;
4069  }
4070 
4071  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4072 
4073  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4074  gtid, th->th.th_new_place, th->th.th_current_place));
4075 
4076  //
4077  // Check that the new place is within this thread's partition.
4078  //
4079  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4080  KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
4081  KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4082  if (th->th.th_first_place <= th->th.th_last_place) {
4083  KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4084  && (th->th.th_new_place <= th->th.th_last_place));
4085  }
4086  else {
4087  KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4088  || (th->th.th_new_place >= th->th.th_last_place));
4089  }
4090 
4091  //
4092  // Copy the thread mask to the kmp_info_t structure,
4093  // and set this thread's affinity.
4094  //
4095  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4096  th->th.th_new_place);
4097  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4098  th->th.th_current_place = th->th.th_new_place;
4099 
4100  if (__kmp_affinity_verbose) {
4101  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4102  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4103  th->th.th_affin_mask);
4104  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
4105  }
4106  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4107 }
4108 
4109 # endif /* OMP_40_ENABLED */
4110 
4111 
4112 int
4113 __kmp_aux_set_affinity(void **mask)
4114 {
4115  int gtid;
4116  kmp_info_t *th;
4117  int retval;
4118 
4119  if (! KMP_AFFINITY_CAPABLE()) {
4120  return -1;
4121  }
4122 
4123  gtid = __kmp_entry_gtid();
4124  KA_TRACE(1000, ;{
4125  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4126  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4127  (kmp_affin_mask_t *)(*mask));
4128  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4129  gtid, buf);
4130  });
4131 
4132  if (__kmp_env_consistency_check) {
4133  if ((mask == NULL) || (*mask == NULL)) {
4134  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4135  }
4136  else {
4137  unsigned proc;
4138  int num_procs = 0;
4139 
4140  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4141  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4142  continue;
4143  }
4144  num_procs++;
4145  if (! KMP_CPU_ISSET(proc, fullMask)) {
4146  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4147  break;
4148  }
4149  }
4150  if (num_procs == 0) {
4151  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4152  }
4153 
4154 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4155  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4156  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4157  }
4158 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4159 
4160  }
4161  }
4162 
4163  th = __kmp_threads[gtid];
4164  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4165  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4166  if (retval == 0) {
4167  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4168  }
4169 
4170 # if OMP_40_ENABLED
4171  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4172  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4173  th->th.th_first_place = 0;
4174  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4175 # endif
4176 
4177  return retval;
4178 }
4179 
4180 
4181 int
4182 __kmp_aux_get_affinity(void **mask)
4183 {
4184  int gtid;
4185  int retval;
4186  kmp_info_t *th;
4187 
4188  if (! KMP_AFFINITY_CAPABLE()) {
4189  return -1;
4190  }
4191 
4192  gtid = __kmp_entry_gtid();
4193  th = __kmp_threads[gtid];
4194  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4195 
4196  KA_TRACE(1000, ;{
4197  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4198  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4199  th->th.th_affin_mask);
4200  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4201  });
4202 
4203  if (__kmp_env_consistency_check) {
4204  if ((mask == NULL) || (*mask == NULL)) {
4205  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4206  }
4207  }
4208 
4209 # if !KMP_OS_WINDOWS
4210 
4211  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4212  KA_TRACE(1000, ;{
4213  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4214  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4215  (kmp_affin_mask_t *)(*mask));
4216  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4217  });
4218  return retval;
4219 
4220 # else
4221 
4222  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4223  return 0;
4224 
4225 # endif /* KMP_OS_WINDOWS */
4226 
4227 }
4228 
4229 
4230 int
4231 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4232 {
4233  int retval;
4234 
4235  if (! KMP_AFFINITY_CAPABLE()) {
4236  return -1;
4237  }
4238 
4239  KA_TRACE(1000, ;{
4240  int gtid = __kmp_entry_gtid();
4241  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4242  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4243  (kmp_affin_mask_t *)(*mask));
4244  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4245  proc, gtid, buf);
4246  });
4247 
4248  if (__kmp_env_consistency_check) {
4249  if ((mask == NULL) || (*mask == NULL)) {
4250  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4251  }
4252  }
4253 
4254  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4255  return -1;
4256  }
4257  if (! KMP_CPU_ISSET(proc, fullMask)) {
4258  return -2;
4259  }
4260 
4261  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4262  return 0;
4263 }
4264 
4265 
4266 int
4267 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4268 {
4269  int retval;
4270 
4271  if (! KMP_AFFINITY_CAPABLE()) {
4272  return -1;
4273  }
4274 
4275  KA_TRACE(1000, ;{
4276  int gtid = __kmp_entry_gtid();
4277  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4278  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4279  (kmp_affin_mask_t *)(*mask));
4280  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4281  proc, gtid, buf);
4282  });
4283 
4284  if (__kmp_env_consistency_check) {
4285  if ((mask == NULL) || (*mask == NULL)) {
4286  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4287  }
4288  }
4289 
4290  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4291  return -1;
4292  }
4293  if (! KMP_CPU_ISSET(proc, fullMask)) {
4294  return -2;
4295  }
4296 
4297  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4298  return 0;
4299 }
4300 
4301 
4302 int
4303 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4304 {
4305  int retval;
4306 
4307  if (! KMP_AFFINITY_CAPABLE()) {
4308  return -1;
4309  }
4310 
4311  KA_TRACE(1000, ;{
4312  int gtid = __kmp_entry_gtid();
4313  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4314  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4315  (kmp_affin_mask_t *)(*mask));
4316  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4317  proc, gtid, buf);
4318  });
4319 
4320  if (__kmp_env_consistency_check) {
4321  if ((mask == NULL) || (*mask == NULL)) {
4322  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4323  }
4324  }
4325 
4326  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4327  return 0;
4328  }
4329  if (! KMP_CPU_ISSET(proc, fullMask)) {
4330  return 0;
4331  }
4332 
4333  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4334 }
4335 
4336 # if KMP_MIC
4337 
4338 // Dynamic affinity settings - Affinity balanced
4339 void __kmp_balanced_affinity( int tid, int nthreads )
4340 {
4341  if( __kmp_affinity_uniform_topology() ) {
4342  int coreID;
4343  int threadID;
4344  // Number of hyper threads per core in HT machine
4345  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4346  // Number of cores
4347  int ncores = __kmp_ncores;
4348  // How many threads will be bound to each core
4349  int chunk = nthreads / ncores;
4350  // How many cores will have an additional thread bound to them - the "big cores"
4351  int big_cores = nthreads % ncores;
4352  // Number of threads on the big cores
4353  int big_nth = ( chunk + 1 ) * big_cores;
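 // Example: nthreads = 10 on 4 cores gives chunk = 2, big_cores = 2 and
 // big_nth = 6, so threads 0-5 land on cores 0-1 (3 per core) and
 // threads 6-9 land on cores 2-3 (2 per core).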
4354  if( tid < big_nth ) {
4355  coreID = tid / (chunk + 1 );
4356  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4357  } else { //tid >= big_nth
4358  coreID = ( tid - big_cores ) / chunk;
4359  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4360  }
4361 
4362  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4363  "Illegal set affinity operation when not capable");
4364 
4365  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4366  KMP_CPU_ZERO(mask);
4367 
4368  // Granularity == thread
4369  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4370  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4371  KMP_CPU_SET( osID, mask);
4372  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4373  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4374  int osID;
4375  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4376  KMP_CPU_SET( osID, mask);
4377  }
4378  }
4379  if (__kmp_affinity_verbose) {
4380  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4381  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4382  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4383  }
4384  __kmp_set_system_affinity( mask, TRUE );
4385  } else { // Non-uniform topology
4386 
4387  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4388  KMP_CPU_ZERO(mask);
4389 
4390  // Number of hyper threads per core in HT machine
4391  int nth_per_core = __kmp_nThreadsPerCore;
4392  int core_level;
4393  if( nth_per_core > 1 ) {
4394  core_level = __kmp_aff_depth - 2;
4395  } else {
4396  core_level = __kmp_aff_depth - 1;
4397  }
4398 
4399  // Maximum number of cores; trailing cores with 0 available processors are not counted
4400  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4401 
4402  // For better performance, handle the special case nthreads == __kmp_avail_proc separately
4403  if( nthreads == __kmp_avail_proc ) {
4404  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4405  int osID = address2os[ tid ].second;
4406  KMP_CPU_SET( osID, mask);
4407  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4408  int coreID = address2os[ tid ].first.labels[ core_level ];
4409  // Count the osIDs found for the current core; there can be at most nth_per_core of them.
4410  // Since address2os is sorted, we can break when cnt == nth_per_core.
4411  int cnt = 0;
4412  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4413  int osID = address2os[ i ].second;
4414  int core = address2os[ i ].first.labels[ core_level ];
4415  if( core == coreID ) {
4416  KMP_CPU_SET( osID, mask);
4417  cnt++;
4418  if( cnt == nth_per_core ) {
4419  break;
4420  }
4421  }
4422  }
4423  }
4424  } else if( nthreads <= __kmp_ncores ) {
4425 
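 // One whole core per thread: thread 'tid' is mapped to the tid-th core that
 // has at least one available proc in procarr[] (all of that core's procs for
 // core granularity, just the first one for thread granularity).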
4426  int core = 0;
4427  for( int i = 0; i < ncores; i++ ) {
4428  // Check if this core from procarr[] is in the mask
4429  int in_mask = 0;
4430  for( int j = 0; j < nth_per_core; j++ ) {
4431  if( procarr[ i * nth_per_core + j ] != -1 ) {
4432  in_mask = 1;
4433  break;
4434  }
4435  }
4436  if( in_mask ) {
4437  if( tid == core ) {
4438  for( int j = 0; j < nth_per_core; j++ ) {
4439  int osID = procarr[ i * nth_per_core + j ];
4440  if( osID != -1 ) {
4441  KMP_CPU_SET( osID, mask );
4442  // For granularity=thread it is enough to set the first available osID for this core
4443  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4444  break;
4445  }
4446  }
4447  }
4448  break;
4449  } else {
4450  core++;
4451  }
4452  }
4453  }
4454 
4455  } else { // nthreads > __kmp_ncores
4456 
4457  // Array to save the number of processors at each core
4458  int nproc_at_core[ ncores ];
4459  // Array to save the number of cores with "x" available processors;
4460  int ncores_with_x_procs[ nth_per_core + 1 ];
4461  // Array to save the number of cores with at least x available processors (x..nth_per_core)
4462  int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4463 
4464  for( int i = 0; i <= nth_per_core; i++ ) {
4465  ncores_with_x_procs[ i ] = 0;
4466  ncores_with_x_to_max_procs[ i ] = 0;
4467  }
4468 
4469  for( int i = 0; i < ncores; i++ ) {
4470  int cnt = 0;
4471  for( int j = 0; j < nth_per_core; j++ ) {
4472  if( procarr[ i * nth_per_core + j ] != -1 ) {
4473  cnt++;
4474  }
4475  }
4476  nproc_at_core[ i ] = cnt;
4477  ncores_with_x_procs[ cnt ]++;
4478  }
4479 
4480  for( int i = 0; i <= nth_per_core; i++ ) {
4481  for( int j = i; j <= nth_per_core; j++ ) {
4482  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4483  }
4484  }
4485 
4486  // Max number of processors
4487  int nproc = nth_per_core * ncores;
4488  // An array to keep the number of threads assigned to each hardware context
4489  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4490  for( int i = 0; i < nproc; i++ ) {
4491  newarr[ i ] = 0;
4492  }
4493 
4494  int nth = nthreads;
4495  int flag = 0;
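 // Distribute the nthreads threads over the available contexts round-robin:
 // on the first pass (flag == 0) each context receives at most one thread;
 // if threads remain once every context holds one, flag is set and the
 // remaining threads are stacked onto contexts on subsequent passes.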
4496  while( nth > 0 ) {
4497  for( int j = 1; j <= nth_per_core; j++ ) {
4498  int cnt = ncores_with_x_to_max_procs[ j ];
4499  for( int i = 0; i < ncores; i++ ) {
4500  // Skip the core with 0 processors
4501  if( nproc_at_core[ i ] == 0 ) {
4502  continue;
4503  }
4504  for( int k = 0; k < nth_per_core; k++ ) {
4505  if( procarr[ i * nth_per_core + k ] != -1 ) {
4506  if( newarr[ i * nth_per_core + k ] == 0 ) {
4507  newarr[ i * nth_per_core + k ] = 1;
4508  cnt--;
4509  nth--;
4510  break;
4511  } else {
4512  if( flag != 0 ) {
4513  newarr[ i * nth_per_core + k ] ++;
4514  cnt--;
4515  nth--;
4516  break;
4517  }
4518  }
4519  }
4520  }
4521  if( cnt == 0 || nth == 0 ) {
4522  break;
4523  }
4524  }
4525  if( nth == 0 ) {
4526  break;
4527  }
4528  }
4529  flag = 1;
4530  }
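 // newarr[] now holds the number of threads assigned to each context; find
 // this thread's context by walking the running sum until it exceeds tid,
 // then set the mask at the chosen granularity.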
4531  int sum = 0;
4532  for( int i = 0; i < nproc; i++ ) {
4533  sum += newarr[ i ];
4534  if( sum > tid ) {
4535  // Granularity == thread
4536  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4537  int osID = procarr[ i ];
4538  KMP_CPU_SET( osID, mask);
4539  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4540  int coreID = i / nth_per_core;
4541  for( int ii = 0; ii < nth_per_core; ii++ ) {
4542  int osID = procarr[ coreID * nth_per_core + ii ];
4543  if( osID != -1 ) {
4544  KMP_CPU_SET( osID, mask);
4545  }
4546  }
4547  }
4548  break;
4549  }
4550  }
4551  __kmp_free( newarr );
4552  }
4553 
4554  if (__kmp_affinity_verbose) {
4555  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4556  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4557  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4558  }
4559  __kmp_set_system_affinity( mask, TRUE );
4560  }
4561 }
4562 
4563 # endif /* KMP_MIC */
4564 
4565 #elif KMP_OS_DARWIN
4566  // affinity not supported
4567 #else
4568  #error "Unknown or unsupported OS"
4569 #endif // KMP_OS_WINDOWS || KMP_OS_LINUX
4570