Teach irqbalance about Intel CoD.

This originally surfaced as a bug in placing network interrupts.  In
the case that this submitter observed, the NIC card was in NUMA domain
0, but each RSS interrupt was getting an affinity list for all CPUs in
the domain.  The expected behavior is for a single cpu to be chosen when
attempting to fan out NIC interrupts.  Due to other implementation
details of interrupt placement, this effectively caused all interrupt
mappings for this NIC to end up on CPU 0.

The bug turns out ot have been caused by Intel Cluster on Die breaking
an assumption in irqbalance about the design of the component hierarchy.
The CoD topology allows a CPU package to belong to more than one NUMA
node, which is not expected.

The RCA was that when the second NUMA node was wired up to the existing
physical package, it overwrote the mappings that were placed there by
the first.

This patch attempts to solve that problem by permitting a package to
have multiple NUMA nodes.  The CPU component hierarchy is preserved, in
case other parts of the code depend upon walking it.  When a CoD
topology is detected, the NUMA node -> CPU component mapping is moved
down a level, so that the nodes point to the first level where the
affinity becomes distinct.  In practice, this has been observed to be
the LLC.

A quick illustration (now, with COD, it looks like this):

                 +-----------+
                 | NUMA Node |
                 |     0     |
                 +-----------+
                       |
                       |        +-------+
                      \|/     / | CPU 0 |
                   +---------+  +-------+
                   | Cache 0 |
                   +---------+  +-------+
                   /          \ | CPU 1 |
      +-----------+             +-------+
      | Package 0 |
      +-----------+             +-------+
                  \           / | CPU 2 |
                   +---------+  +-------+
                   | Cache 1 |
                   +---------+
                       ^      \ +-------+
                       |        | CPU 3 |
                       |        +-------+
                 +-----------+
                 | NUMA Node |
                 |     1     |
                 +-----------+

Whereas, previously only NUMA Node 1 would end up pointing to package 0.
The topology should not be different on platforms that do not enable
CoD.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
This commit is contained in:
Krister Johansen 2017-07-10 17:00:27 -07:00
parent 9ea96c1fd8
commit 7bc1244fbf
5 changed files with 94 additions and 41 deletions

View file

@ -121,8 +121,32 @@ out:
log(TO_CONSOLE, LOG_INFO, "Banned CPUs: %s\n", buffer);
}
static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
int packageid, cpumask_t package_mask)
static void add_numa_node_to_topo_obj(struct topo_obj *obj, int nodeid)
{
GList *entry;
struct topo_obj *node;
struct topo_obj *cand_node;
node = get_numa_node(nodeid);
if (!node || node->number == -1)
return;
entry = g_list_first(obj->numa_nodes);
while (entry) {
cand_node = entry->data;
if (cand_node == node)
break;
entry = g_list_next(entry);
}
if (!entry)
obj->numa_nodes = g_list_append(obj->numa_nodes, node);
}
static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
int packageid,
cpumask_t package_mask,
int nodeid)
{
GList *entry;
struct topo_obj *package;
@ -165,10 +189,14 @@ static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
cache->parent = package;
}
if (nodeid > -1)
add_numa_node_to_topo_obj(package, nodeid);
return package;
}
static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
cpumask_t cache_mask)
cpumask_t cache_mask,
int nodeid)
{
GList *entry;
struct topo_obj *cache;
@ -208,6 +236,9 @@ static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
cpu->parent = (struct topo_obj *)cache;
}
if (nodeid > -1)
add_numa_node_to_topo_obj(cache, nodeid);
return cache;
}
@ -344,9 +375,9 @@ static void do_one_cpu(char *path)
cpus_and(cache_mask, cache_mask, unbanned_cpus);
cpus_and(package_mask, package_mask, unbanned_cpus);
cache = add_cpu_to_cache_domain(cpu, cache_mask);
package = add_cache_domain_to_package(cache, packageid, package_mask);
add_package_to_node(package, nodeid);
cache = add_cpu_to_cache_domain(cpu, cache_mask, nodeid);
package = add_cache_domain_to_package(cache, packageid, package_mask,
nodeid);
cpu->obj_type_list = &cpus;
cpus = g_list_append(cpus, cpu);
@ -368,12 +399,18 @@ static void dump_irq(struct irq_info *info, void *data)
free(indent);
}
static void dump_numa_node_num(struct topo_obj *p, void *data __attribute__((unused)))
{
log(TO_CONSOLE, LOG_INFO, "%d ", p->number);
}
static void dump_balance_obj(struct topo_obj *d, void *data __attribute__((unused)))
{
struct topo_obj *c = (struct topo_obj *)d;
log(TO_CONSOLE, LOG_INFO, "%s%s%s%sCPU number %i numa_node is %d (load %lu)\n",
log_indent, log_indent, log_indent, log_indent,
c->number, cpu_numa_node(c)->number , (unsigned long)c->load);
log(TO_CONSOLE, LOG_INFO, "%s%s%s%sCPU number %i numa_node is ",
log_indent, log_indent, log_indent, log_indent, c->number);
for_each_object(cpu_numa_node(c), dump_numa_node_num, NULL);
log(TO_CONSOLE, LOG_INFO, "(load %lu)\n", (unsigned long)c->load);
if (c->interrupts)
for_each_irq(c->interrupts, dump_irq, (void *)18);
}
@ -382,9 +419,11 @@ static void dump_cache_domain(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4095, d->mask);
log(TO_CONSOLE, LOG_INFO, "%s%sCache domain %i: numa_node is %d cpu mask is %s (load %lu) \n",
log_indent, log_indent,
d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load);
log(TO_CONSOLE, LOG_INFO, "%s%sCache domain %i: numa_node is ",
log_indent, log_indent, d->number);
for_each_object(d->numa_nodes, dump_numa_node_num, NULL);
log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu) \n", buffer,
(unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_balance_obj, NULL);
if (g_list_length(d->interrupts) > 0)
@ -395,8 +434,10 @@ static void dump_package(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4096, d->mask);
log(TO_CONSOLE, LOG_INFO, "Package %i: numa_node is %d cpu mask is %s (load %lu)\n",
d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load);
log(TO_CONSOLE, LOG_INFO, "Package %i: numa_node ", d->number);
for_each_object(d->numa_nodes, dump_numa_node_num, NULL);
log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu)\n",
buffer, (unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_cache_domain, buffer);
if (g_list_length(d->interrupts) > 0)
@ -448,9 +489,9 @@ void parse_cpu_tree(void)
char pad;
entry = readdir(dir);
/*
* We only want to count real cpus, not cpufreq and
* cpuidle
*/
* We only want to count real cpus, not cpufreq and
* cpuidle
*/
if (entry &&
sscanf(entry->d_name, "cpu%d%c", &num, &pad) == 1 &&
!strchr(entry->d_name, ' ')) {
@ -459,7 +500,8 @@ void parse_cpu_tree(void)
do_one_cpu(new_path);
}
} while (entry);
closedir(dir);
closedir(dir);
for_each_object(packages, connect_cpu_mem_topo, NULL);
if (debug_mode)
dump_tree();
@ -483,6 +525,7 @@ void clear_cpu_tree(void)
package = item->data;
g_list_free(package->children);
g_list_free(package->interrupts);
g_list_free(package->numa_nodes);
free(package);
packages = g_list_delete_link(packages, item);
}
@ -493,6 +536,7 @@ void clear_cpu_tree(void)
cache_domain = item->data;
g_list_free(cache_domain->children);
g_list_free(cache_domain->interrupts);
g_list_free(cache_domain->numa_nodes);
free(cache_domain);
cache_domains = g_list_delete_link(cache_domains, item);
}

View file

@ -193,9 +193,14 @@ static void parse_command_line(int argc, char **argv)
#endif
/*
* This builds our object tree. The Heirarchy is pretty straightforward
* This builds our object tree. The Heirarchy is typically pretty
* straightforward.
* At the top are numa_nodes
* All CPU packages belong to a single numa_node
* CPU packages belong to a single numa_node, unless the cache domains are in
* separate nodes. In that case, the cache domain's parent is the package, but
* the numa nodes point to the cache domains instead of the package as their
* children. This allows us to maintain the CPU hierarchy while adjusting for
* alternate memory topologies that are present on recent processor.
* All Cache domains belong to a CPU package
* All CPU cores belong to a cache domain
*

View file

@ -84,26 +84,13 @@ extern long HZ;
extern void build_numa_node_list(void);
extern void free_numa_node_list(void);
extern void dump_numa_node_info(struct topo_obj *node, void *data);
extern void add_package_to_node(struct topo_obj *p, int nodeid);
extern void connect_cpu_mem_topo(struct topo_obj *p, void *data);
extern struct topo_obj *get_numa_node(int nodeid);
/*
* Package functions
*/
#define package_numa_node(p) ((p)->parent)
/*
* cache_domain functions
*/
#define cache_domain_package(c) ((c)->parent)
#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c))))
/*
* cpu core functions
*/
#define cpu_cache_domain(cpu) ((cpu)->parent)
#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu))))
#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu)))))
#define cpu_numa_node(cpu) ((cpu)->parent->numa_nodes)
extern struct topo_obj *find_cpu_core(int cpunr);
extern int get_cpu_count(void);

28
numa.c
View file

@ -146,22 +146,38 @@ static gint compare_node(gconstpointer a, gconstpointer b)
return (ai->number == bi->number) ? 0 : 1;
}
void add_package_to_node(struct topo_obj *p, int nodeid)
void connect_cpu_mem_topo(struct topo_obj *p, void *data __attribute__((unused)))
{
GList *entry;
struct topo_obj *node;
struct topo_obj *lchild;
int len;
node = get_numa_node(nodeid);
len = g_list_length(p->numa_nodes);
if (!node) {
log(TO_CONSOLE, LOG_INFO, "Could not find numa node for node id %d\n", nodeid);
if (len == 0) {
return;
} else if (len > 1) {
for_each_object(p->children, connect_cpu_mem_topo, NULL);
return;
}
entry = g_list_first(p->numa_nodes);
node = entry->data;
if (!p->parent) {
node->children = g_list_append(node->children, p);
if (p->obj_type == OBJ_TYPE_PACKAGE && !p->parent)
p->parent = node;
entry = g_list_first(node->children);
while (entry) {
lchild = entry->data;
if (lchild == p)
break;
entry = g_list_next(entry);
}
if (!entry)
node->children = g_list_append(node->children, p);
}
void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused)))

View file

@ -54,6 +54,7 @@ struct topo_obj {
GList *interrupts;
struct topo_obj *parent;
GList *children;
GList *numa_nodes;
GList **obj_type_list;
};