From 229925e0359a82308af44e5f01d3a8ed311f4a6b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 27 Sep 2011 11:26:40 -0400 Subject: [PATCH 01/44] Add numa node list to irqbalance Step 1 of the new balance algorithm is to independently parse the numa nodes in sysfs, creating a list of the available nodes --- numa.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ types.h | 8 ++++++++ 2 files changed, 69 insertions(+) diff --git a/numa.c b/numa.c index 51ee88c..4abce36 100644 --- a/numa.c +++ b/numa.c @@ -33,6 +33,10 @@ #include "irqbalance.h" +#define SYSFS_NODE_PATH "sys/devices/system/node" + +GList *numa_nodes = NULL; + void pci_numa_scan(void) { int irq = -1; @@ -54,3 +58,60 @@ void pci_numa_scan(void) } while (irq != -1); } + +void add_one_node(const char *nodename) +{ + char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1); + struct numa_node *new; + char *cpustr; + FILE *f; + + if (!path) + return; + new = calloc(1, sizeof(struct numa_node)); + if (!new) + return; + sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); + f = fopen(path, "r"); + if (ferror(f)) { + cpus_clear(new->local_cpus); + } else { + fscanf(f, "%as", &cpustr); + if (!cpustr) { + cpus_clear(new->local_cpus); + } else { + cpumask_parse_user(cpustr, strlen(cpustr), new->local_cpus); + free(cpustr); + } + } + + new->number = strtoul(&nodename[4], NULL, 10); + numa_nodes = g_list_append(numa_nodes, new); +} + +void build_numa_node_list(void) +{ + DIR *dir = opendir(SYSFS_NODE_PATH); + struct dirent *entry; + + do { + entry = readdir(dir); + if (!entry) + break; + if ((entry->d_type == DT_DIR) && (strstr("node", entry->d_name))) { + add_one_node(entry->d_name); + } + } while (entry); +} + +static void free_numa_node(gpointer data) +{ + free(data); +} + +void free_numa_node_list(void) +{ + g_list_free_full(numa_nodes, free_numa_node); + numa_nodes = NULL; +} + diff --git a/types.h b/types.h index dc166e1..80a7cae 100644 --- a/types.h +++ b/types.h @@ -38,6 
+38,14 @@ enum irq_prop { IRQ_MAX_PROPERTY }; +struct numa_node { + uint64_t workload; + int number; + cpumask_t local_cpus; + GList *packages; + GList *interrupts; +}; + struct package { uint64_t workload; int number; From 40a52df965b8491d091954e5194e3ec979a66675 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 27 Sep 2011 14:17:02 -0400 Subject: [PATCH 02/44] Add numa node references between packages and numa nodes Step 2 of the new balance algorithm is to create references between the numa node list in step 1 and the discovered packages during the cpu tree parsing --- cputree.c | 1 + irqbalance.c | 32 +++++++++++++++++++++++++++++ irqbalance.h | 10 +++++++++ numa.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--- types.h | 2 ++ 5 files changed, 99 insertions(+), 3 deletions(-) diff --git a/cputree.c b/cputree.c index c83f63f..63225dc 100644 --- a/cputree.c +++ b/cputree.c @@ -122,6 +122,7 @@ static void fill_packages(void) entry2 = g_list_next(entry2); } packages = g_list_append(packages, package); + add_package_to_node(package, search_numa_node(package->mask)); package_count++; } } diff --git a/irqbalance.c b/irqbalance.c index 8ae8197..453b6c5 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -96,6 +96,33 @@ static void parse_command_line(int argc, char **argv) } #endif +/* + * This builds our object tree. 
The Heirarchy is pretty straightforward + * At the top are numa_nodes + * All CPU packages belong to a single numa_node + * All Cache domains belong to a CPU package + * All CPU cores belong to a cache domain + * + * Objects are built in that order (top down) + * + * Object workload is the aggregate sum of the + * workload of the objects below it + */ +static void build_object_tree() +{ + build_numa_node_list(); +} + +static void free_object_tree() +{ + free_numa_node_list(); +} + +static void dump_object_tree() +{ + for_each_numa_node(dump_numa_node_info); +} + int main(int argc, char** argv) { @@ -125,6 +152,10 @@ int main(int argc, char** argv) } + build_object_tree(); + if (debug_mode) + dump_object_tree(); + rebuild_irq_db(); parse_cpu_tree(); @@ -199,5 +230,6 @@ int main(int argc, char** argv) break; counter++; } + free_object_tree(); return EXIT_SUCCESS; } diff --git a/irqbalance.h b/irqbalance.h index 3e76353..a63f1e7 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -51,4 +51,14 @@ void check_power_mode(void); void clear_cpu_tree(void); void pci_numa_scan(void); +/*===================NEW BALANCER FUNCTIONS============================*/ +/* + * Numa node access routines + */ +extern void build_numa_node_list(void); +extern void free_numa_node_list(void); +extern void dump_numa_node_info(struct numa_node *node); +extern void for_each_numa_node(void (*cb)(struct numa_node *node)); +extern void add_package_to_node(struct package *p, int nodeid); #endif + diff --git a/numa.c b/numa.c index 4abce36..6a83c23 100644 --- a/numa.c +++ b/numa.c @@ -33,7 +33,7 @@ #include "irqbalance.h" -#define SYSFS_NODE_PATH "sys/devices/system/node" +#define SYSFS_NODE_PATH "/sys/devices/system/node" GList *numa_nodes = NULL; @@ -59,7 +59,7 @@ void pci_numa_scan(void) } while (irq != -1); } -void add_one_node(const char *nodename) +static void add_one_node(const char *nodename) { char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1); struct numa_node *new; @@ -98,7 
+98,7 @@ void build_numa_node_list(void) entry = readdir(dir); if (!entry) break; - if ((entry->d_type == DT_DIR) && (strstr("node", entry->d_name))) { + if ((entry->d_type == DT_DIR) && (strstr(entry->d_name, "node"))) { add_one_node(entry->d_name); } } while (entry); @@ -115,3 +115,54 @@ void free_numa_node_list(void) numa_nodes = NULL; } +static gint compare_node(gconstpointer a, gconstpointer b) +{ + const struct numa_node *ai = a; + const struct numa_node *bi = b; + + return (ai->number == bi->number) ? 0 : 1; +} + +void add_package_to_node(struct package *p, int nodeid) +{ + struct numa_node find, *node; + find.number = nodeid; + GList *entry; + + find.number = nodeid; + entry = g_list_find_custom(numa_nodes, &find, compare_node); + + if (!entry) { + if (debug_mode) + printf("Could not find numa node for node id %d\n", nodeid); + return; + } + + node = entry->data; + + node->packages = g_list_append(node->packages, p); + p->numa_node = node; +} + +void dump_numa_node_info(struct numa_node *node) +{ + char buffer[4096]; + + printf("NUMA NODE NUMBER: %d\n", node->number); + cpumask_scnprintf(buffer, 4096, node->local_cpus); + printf("LOCAL CPU MASK: %s\n", buffer); + printf("\n"); +} + +void for_each_numa_node(void(*cb)(struct numa_node *node)) +{ + GList *entry; + + entry = g_list_first(numa_nodes); + + while (entry) { + cb(entry->data); + entry = g_list_next(entry); + } +} + diff --git a/types.h b/types.h index 80a7cae..d6f0f3b 100644 --- a/types.h +++ b/types.h @@ -51,6 +51,8 @@ struct package { int number; cpumask_t mask; + struct numa_node *numa_node; + int node_num; int class_count[7]; From 3a81cc4dcf4fa9438ec4c3ad713a44c9e6a8baed Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 27 Sep 2011 14:32:32 -0400 Subject: [PATCH 03/44] Add references between cache domains and packages Step 3 of the new balance algorithm is to create references between the package list in step 2 and the cache domains during the cpu tree parsing, as well as the cpu_cores and 
the cache domains --- cputree.c | 1 + irqbalance.h | 4 ++++ types.h | 3 +++ 3 files changed, 8 insertions(+) diff --git a/cputree.c b/cputree.c index 63225dc..f6fafd2 100644 --- a/cputree.c +++ b/cputree.c @@ -116,6 +116,7 @@ static void fill_packages(void) if (cpus_equal(cache->package_mask, cache2->package_mask)) { cache2->marker = 1; package->cache_domains = g_list_append(package->cache_domains, cache2); + cache->package = package; if (package->number > cache2->number) package->number = cache2->number; } diff --git a/irqbalance.h b/irqbalance.h index a63f1e7..fe74182 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -60,5 +60,9 @@ extern void free_numa_node_list(void); extern void dump_numa_node_info(struct numa_node *node); extern void for_each_numa_node(void (*cb)(struct numa_node *node)); extern void add_package_to_node(struct package *p, int nodeid); + +/* + * Package functions + */ #endif diff --git a/types.h b/types.h index d6f0f3b..ad56b5c 100644 --- a/types.h +++ b/types.h @@ -68,6 +68,8 @@ struct cache_domain { int marker; int node_num; + struct package *package; + cpumask_t mask; cpumask_t package_mask; @@ -85,6 +87,7 @@ struct cpu_core { int marker; int node_num; + struct cache_domain *cache_domain; int class_count[7]; From a790e340b83ba950ddf3681ae377a48d777263e8 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 27 Sep 2011 16:07:28 -0400 Subject: [PATCH 04/44] Add some minor cleanup to the main loop to integrate the new object tree --- classify.c | 4 ++++ irqbalance.c | 12 ++++++------ irqbalance.h | 7 ++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/classify.c b/classify.c index 6c55086..92b4bb4 100644 --- a/classify.c +++ b/classify.c @@ -254,6 +254,10 @@ done: return; } +void free_irq_db(void) +{ + g_list_free_full(interrupts_db, free_int); +} void rebuild_irq_db(void) { diff --git a/irqbalance.c b/irqbalance.c index 453b6c5..f341573 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -111,11 +111,15 @@ static void 
parse_command_line(int argc, char **argv) static void build_object_tree() { build_numa_node_list(); + parse_cpu_tree(); + rebuild_irq_db(); } static void free_object_tree() { free_numa_node_list(); + clear_cpu_tree(); + free_irq_db(); } static void dump_object_tree() @@ -156,10 +160,6 @@ int main(int argc, char** argv) if (debug_mode) dump_object_tree(); - rebuild_irq_db(); - - parse_cpu_tree(); - /* On single core UP systems irqbalance obviously has no work to do */ if (core_count<2) @@ -210,8 +210,8 @@ int main(int argc, char** argv) reset_counts(); clear_work_stats(); - clear_cpu_tree(); - parse_cpu_tree(); + free_object_tree(); + build_object_tree(); } calculate_workload(); diff --git a/irqbalance.h b/irqbalance.h index fe74182..dcfd682 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -29,7 +29,6 @@ extern GList *interrupts; extern void parse_cpu_tree(void); extern void clear_work_stats(void); extern void parse_proc_interrupts(void); -extern void rebuild_irq_db(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); extern int get_next_irq(int irq); @@ -64,5 +63,11 @@ extern void add_package_to_node(struct package *p, int nodeid); /* * Package functions */ + +/* + * irq db functions + */ +extern void rebuild_irq_db(void); +extern void free_irq_db(void); #endif From 157ad44ebd11019bb0c28ede12c08c9334d28276 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Sep 2011 10:59:10 -0400 Subject: [PATCH 05/44] Ad interrupt count property to irq db --- classify.c | 22 ++++++++++++++++++++++ irqbalance.h | 5 +++-- irqlist.c | 2 ++ types.h | 1 + 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/classify.c b/classify.c index 92b4bb4..f040b6d 100644 --- a/classify.c +++ b/classify.c @@ -84,6 +84,7 @@ static void init_new_irq(struct irq_info *new) new->property[IRQ_TYPE].itype = INT_TYPE; new->property[IRQ_NUMA].itype = INT_TYPE; new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE; + 
new->property[IRQ_INT_COUNT].itype = INT_TYPE; } static gint compare_ints(gconstpointer a, gconstpointer b) @@ -319,6 +320,27 @@ int find_irq_integer_prop(int irq, enum irq_prop prop) return result->property[prop].iint_val; } +int set_irq_integer_prop(int irq, enum irq_prop prop, int val) +{ + GList *entry; + struct irq_info find, *result; + + find.irq = irq; + + entry = g_list_find_custom(interrupts_db, &find, compare_ints); + + if (!entry) { + if (debug_mode) + printf("No entry for irq %d in the irq database, adding default entry\n", irq); + entry = add_misc_irq(irq); + } + + result = entry->data; + assert(result->property[prop].itype == INT_TYPE); + result->property[prop].iint_val = val; + return 0; +} + cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop) { GList *entry; diff --git a/irqbalance.h b/irqbalance.h index dcfd682..a273db8 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -32,8 +32,6 @@ extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); extern int get_next_irq(int irq); -extern int find_irq_integer_prop(int irq, enum irq_prop prop); -extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop); extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); @@ -69,5 +67,8 @@ extern void add_package_to_node(struct package *p, int nodeid); */ extern void rebuild_irq_db(void); extern void free_irq_db(void); +extern int set_irq_integer_prop(int irq, enum irq_prop prop, int val); +extern int find_irq_integer_prop(int irq, enum irq_prop prop); +extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop); #endif diff --git a/irqlist.c b/irqlist.c index 6ea48b1..7b4a436 100644 --- a/irqlist.c +++ b/irqlist.c @@ -191,6 +191,7 @@ void set_interrupt_count(int number, uint64_t count) if (irq->number == number) { irq->count = count; + set_irq_integer_prop(number, IRQ_INT_COUNT, count); /* see if affinity_hint changed */ 
get_affinity_hint(irq, number); return; @@ -207,6 +208,7 @@ void set_interrupt_count(int number, uint64_t count) irq->count = count; irq->allowed_mask = CPU_MASK_ALL; investigate(irq, number); + set_irq_integer_prop(number, IRQ_INT_COUNT, count); interrupts = g_list_append(interrupts, irq); } diff --git a/types.h b/types.h index ad56b5c..4546b36 100644 --- a/types.h +++ b/types.h @@ -35,6 +35,7 @@ enum irq_prop { IRQ_TYPE, IRQ_NUMA, IRQ_LCPU_MASK, + IRQ_INT_COUNT, IRQ_MAX_PROPERTY }; From 31b1ae2a4a00ed9149fecfd9c41a48a33c578c29 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Sep 2011 11:13:58 -0400 Subject: [PATCH 06/44] Clean up irq api to have a for_each_irq function --- classify.c | 13 +++++++++++++ irqbalance.h | 2 +- numa.c | 23 +++++++++++------------ 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/classify.c b/classify.c index f040b6d..dbd3f94 100644 --- a/classify.c +++ b/classify.c @@ -383,3 +383,16 @@ int get_next_irq(int irq) irqp= entry->data; return irqp->irq; } + +void for_each_irq(void (*cb)(int irq)) +{ + struct irq_info *info; + GList *entry = g_list_first(interrupts_db); + + while (entry) { + info = entry->data; + cb(info->irq); + entry = g_list_next(entry); + } +} + diff --git a/irqbalance.h b/irqbalance.h index a273db8..632ca6c 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -31,7 +31,6 @@ extern void clear_work_stats(void); extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); -extern int get_next_irq(int irq); extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); @@ -70,5 +69,6 @@ extern void free_irq_db(void); extern int set_irq_integer_prop(int irq, enum irq_prop prop, int val); extern int find_irq_integer_prop(int irq, enum irq_prop prop); extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop); +extern void for_each_irq(void (*cb)(int irq)); #endif diff --git a/numa.c b/numa.c 
index 6a83c23..9316dd2 100644 --- a/numa.c +++ b/numa.c @@ -37,26 +37,25 @@ GList *numa_nodes = NULL; -void pci_numa_scan(void) +static void set_irq_numa(int irq) { - int irq = -1; cpumask_t mask; int node_num; - do { - int type; - irq = get_next_irq(irq); - if (irq == -1) - break; + int type; - mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK); + mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK); - node_num = find_irq_integer_prop(irq, IRQ_NUMA); + node_num = find_irq_integer_prop(irq, IRQ_NUMA); - type = find_irq_integer_prop(irq, IRQ_CLASS); + type = find_irq_integer_prop(irq, IRQ_CLASS); - add_interrupt_numa(irq, mask, node_num, type); + add_interrupt_numa(irq, mask, node_num, type); - } while (irq != -1); +} + +void pci_numa_scan(void) +{ + for_each_irq(set_irq_numa); } static void add_one_node(const char *nodename) From 53d6405cb52b84dcb8efc26626e2ab83dafbddd7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Sep 2011 15:10:16 -0400 Subject: [PATCH 07/44] API: Some cleanup, moving cpumask_t's to the correct objects Some api cleanup, moving masks for cpus to their approrpiate corresponding objects, rather than placing them all with the cpus --- cputree.c | 172 ++++++++++++++++++++++++++++----------------------- irqbalance.h | 13 ++++ placement.c | 14 ++--- types.h | 7 --- 4 files changed, 115 insertions(+), 91 deletions(-) diff --git a/cputree.c b/cputree.c index f6fafd2..719cbad 100644 --- a/cputree.c +++ b/cputree.c @@ -88,91 +88,98 @@ static int search_numa_node(cpumask_t mask) return node_num; } -static void fill_packages(void) +static struct package* add_cache_domain_to_package(struct cache_domain *cache, + cpumask_t package_mask) { GList *entry; + struct package *package; + struct cache_domain *lcache; + + entry = g_list_first(packages); - entry = g_list_first(cache_domains); while (entry) { - struct package *package; - struct cache_domain *cache = NULL; - GList *entry2; - - cache = entry->data; - entry2 = entry; - entry = g_list_next(entry); - if 
(cache->marker) - continue; - package = malloc(sizeof(struct package)); - if (!package) + package = entry->data; + if (cpus_equal(package_mask, package->mask)) break; - memset(package, 0, sizeof(struct package)); - package->mask = cache->package_mask; - package->number = cache->number; - package->node_num = search_numa_node(package->mask); - while (entry2) { - struct cache_domain *cache2; - cache2 = entry2->data; - if (cpus_equal(cache->package_mask, cache2->package_mask)) { - cache2->marker = 1; - package->cache_domains = g_list_append(package->cache_domains, cache2); - cache->package = package; - if (package->number > cache2->number) - package->number = cache2->number; - } - entry2 = g_list_next(entry2); - } + entry = g_list_next(entry); + } + + if (!entry) { + package = calloc(sizeof(struct package), 1); + if (!package) + return NULL; + package->mask = package_mask; packages = g_list_append(packages, package); - add_package_to_node(package, search_numa_node(package->mask)); package_count++; } -} -static void fill_cache_domain(void) + entry = g_list_first(package->cache_domains); + while (entry) { + lcache = entry->data; + if (lcache == cache) + break; + entry = g_list_next(entry); + } + + if (!entry) { + package->cache_domains = g_list_append(package->cache_domains, cache); + cache->package = package; + } + + return package; +} +static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, + cpumask_t cache_mask) { GList *entry; + struct cache_domain *cache; + struct cpu_core *lcpu; + + entry = g_list_first(cache_domains); - entry = g_list_first(cpus); while (entry) { - struct cache_domain *cache = NULL; - struct cpu_core *cpu; - GList *entry2; - cpu = entry->data; - entry2 = entry; - entry = g_list_next(entry); - if (cpu->marker) - continue; - cache = malloc(sizeof(struct cache_domain)); - if (!cache) + cache = entry->data; + if (cpus_equal(cache_mask, cache->mask)) break; - memset(cache, 0, sizeof(struct cache_domain)); - cache->mask = 
cpu->cache_mask; - cache->package_mask = cpu->package_mask; - cache->number = cpu->number; - cache->node_num = search_numa_node(cache->mask); + entry = g_list_next(entry); + } + + if (!entry) { + cache = calloc(sizeof(struct cache_domain), 1); + if (!cache) + return NULL; + cache->mask = cache_mask; cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; - while (entry2) { - struct cpu_core *cpu2; - cpu2 = entry2->data; - if (cpus_equal(cpu->cache_mask, cpu2->cache_mask) && - cpus_equal(cpu->package_mask, cpu2->package_mask)) { - cpu2->marker = 1; - cache->cpu_cores = g_list_append(cache->cpu_cores, cpu2); - if (cpu2->number < cache->number) - cache->number = cpu2->number; - } - entry2 = g_list_next(entry2); - } } + + entry = g_list_first(cache->cpu_cores); + while (entry) { + lcpu = entry->data; + if (lcpu == cpu) + break; + entry = g_list_next(entry); + } + + if (!entry) { + cache->cpu_cores = g_list_append(cache->cpu_cores, cpu); + cpu->cache_domain = cache; + } + + return cache; } - - + static void do_one_cpu(char *path) { struct cpu_core *cpu; FILE *file; char new_path[PATH_MAX]; + cpumask_t cache_mask, package_mask; + struct cache_domain *cache; + struct package *package; + DIR *dir; + struct dirent *entry; + int nodeid; /* skip offline cpus */ snprintf(new_path, PATH_MAX, "%s/online", path); @@ -201,9 +208,6 @@ static void do_one_cpu(char *path) cpu_set(cpu->number, cpu->mask); - /* set numa node of cpu */ - cpu->node_num = search_numa_node(cpu->mask); - /* if the cpu is on the banned list, just don't add it */ if (cpus_intersects(cpu->mask, banned_cpus)) { free(cpu); @@ -216,26 +220,26 @@ static void do_one_cpu(char *path) /* try to read the package mask; if it doesn't exist assume solitary */ snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path); file = fopen(new_path, "r"); - cpu_set(cpu->number, cpu->package_mask); + cpu_set(cpu->number, package_mask); if (file) { char *line = NULL; size_t size = 0; if (getline(&line, &size, 
file)) - cpumask_parse_user(line, strlen(line), cpu->package_mask); + cpumask_parse_user(line, strlen(line), package_mask); fclose(file); free(line); } /* try to read the cache mask; if it doesn't exist assume solitary */ /* We want the deepest cache level available so try index1 first, then index2 */ - cpu_set(cpu->number, cpu->cache_mask); + cpu_set(cpu->number, cache_mask); snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path); file = fopen(new_path, "r"); if (file) { char *line = NULL; size_t size = 0; if (getline(&line, &size, file)) - cpumask_parse_user(line, strlen(line), cpu->cache_mask); + cpumask_parse_user(line, strlen(line), cache_mask); fclose(file); free(line); } @@ -245,17 +249,34 @@ static void do_one_cpu(char *path) char *line = NULL; size_t size = 0; if (getline(&line, &size, file)) - cpumask_parse_user(line, strlen(line), cpu->cache_mask); + cpumask_parse_user(line, strlen(line), cache_mask); fclose(file); free(line); } + nodeid=0; + dir = opendir(path); + do { + entry = readdir(dir); + if (!entry) + break; + if (strstr(entry->d_name, "node")) { + nodeid = strtoul(&entry->d_name[4], NULL, 10); + break; + } + } while (entry); + closedir(dir); + + cache = add_cpu_to_cache_domain(cpu, cache_mask); + package = add_cache_domain_to_package(cache, package_mask); + add_package_to_node(package, nodeid); + /* blank out the banned cpus from the various masks so that interrupts will never be told to go there */ - cpus_and(cpu->cache_mask, cpu->cache_mask, unbanned_cpus); - cpus_and(cpu->package_mask, cpu->package_mask, unbanned_cpus); + cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus); + cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus); cpus_and(cpu->mask, cpu->mask, unbanned_cpus); cpus = g_list_append(cpus, cpu); @@ -286,7 +307,7 @@ void dump_tree(void) while (p_iter) { package = p_iter->data; cpumask_scnprintf(buffer, 4096, package->mask); - printf("Package %i: numa_node is %d cpu mask 
is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload); + printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package_numa_node(package)->number, buffer, (unsigned long)package->workload); c_iter = g_list_first(package->cache_domains); while (c_iter) { cache_domain = c_iter->data; @@ -297,7 +318,7 @@ void dump_tree(void) while (cp_iter) { cpu = cp_iter->data; cp_iter = g_list_next(cp_iter); - printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload); + printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu_numa_node(cpu)->number , (unsigned long)cpu->workload); dump_irqs(18, cpu->interrupts); } dump_irqs(10, cache_domain->interrupts); @@ -375,9 +396,6 @@ void parse_cpu_tree(void) } while (entry); closedir(dir); - fill_cache_domain(); - fill_packages(); - if (debug_mode) dump_tree(); diff --git a/irqbalance.h b/irqbalance.h index 632ca6c..d675ecb 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -60,6 +60,19 @@ extern void add_package_to_node(struct package *p, int nodeid); /* * Package functions */ +#define package_numa_node(p) ((p)->numa_node) +/* + * cache_domain functions + */ +#define cache_domain_package(c) ((c)->package) +#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) + +/* + * cpu core functions + */ +#define cpu_cache_domain(cpu) ((cpu)->cache_domain) +#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) +#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) /* * irq db functions diff --git a/placement.c b/placement.c index 828ce8e..6bc2411 100644 --- a/placement.c +++ b/placement.c @@ -42,9 +42,9 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package bonus = CROSS_PACKAGE_PENALTY; /* do a little numa affinity */ - if (irq->node_num != package->node_num) { - if (irq->node_num >= 
0 && package->node_num >= 0) { - dist = numa_distance(irq->node_num, package->node_num); + if (irq->node_num != package_numa_node(package)->number) { + if (irq->node_num >= 0 && package_numa_node(package)->number >= 0) { + dist = numa_distance(irq->node_num, package_numa_node(package)->number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -118,9 +118,9 @@ static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) bonus = CROSS_PACKAGE_PENALTY/3; /* do a little numa affinity */ - if (irq->node_num != cpu->node_num) { - if (irq->node_num >= 0 && cpu->node_num >= 0) { - dist = numa_distance(irq->node_num, cpu->node_num); + if (irq->node_num != cpu_numa_node(cpu)->number) { + if (irq->node_num >= 0 && cpu_numa_node(cpu)->number >= 0) { + dist = numa_distance(irq->node_num, cpu_numa_node(cpu)->number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -134,7 +134,7 @@ static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) * since some chipsets only place at the first cpu, give a tiny preference to non-first * cpus for specifically placed interrupts */ - if (first_cpu(cpu->cache_mask)==cpu->number) + if (first_cpu(cpu_cache_domain(cpu)->mask)==cpu->number) bonus++; /* pay 6000 for each previous interrupt of the same class */ diff --git a/types.h b/types.h index 4546b36..469c2f8 100644 --- a/types.h +++ b/types.h @@ -54,8 +54,6 @@ struct package { cpumask_t mask; struct numa_node *numa_node; - int node_num; - int class_count[7]; GList *cache_domains; @@ -73,8 +71,6 @@ struct cache_domain { cpumask_t mask; - cpumask_t package_mask; - int class_count[7]; GList *cpu_cores; @@ -87,13 +83,10 @@ struct cpu_core { int number; int marker; - int node_num; struct cache_domain *cache_domain; int class_count[7]; - cpumask_t package_mask; - cpumask_t cache_mask; cpumask_t mask; GList *interrupts; From 
90cf41fb911abd07a3040b850e5d1f8270a1a07a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Sep 2011 15:54:07 -0400 Subject: [PATCH 08/44] Remove unsued variables from irq struct and add last_count to new irq db --- classify.c | 4 ++++ irqlist.c | 7 +++---- types.h | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/classify.c b/classify.c index dbd3f94..a8cb640 100644 --- a/classify.c +++ b/classify.c @@ -83,8 +83,10 @@ static void init_new_irq(struct irq_info *new) new->property[IRQ_CLASS].itype = INT_TYPE; new->property[IRQ_TYPE].itype = INT_TYPE; new->property[IRQ_NUMA].itype = INT_TYPE; + new->property[IRQ_LEVEL].itype = INT_TYPE; new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE; new->property[IRQ_INT_COUNT].itype = INT_TYPE; + new->property[IRQ_LAST_INT_COUNT].itype = INT_TYPE; } static gint compare_ints(gconstpointer a, gconstpointer b) @@ -161,6 +163,8 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq) goto get_numa_node; new->property[IRQ_CLASS].iint_val = class_codes[class]; + new->property[IRQ_LEVEL].iint_val = map_class_to_level[class_codes[class]]; + get_numa_node: numa_node = -1; sprintf(path, "%s/numa_node", devpath); diff --git a/irqlist.c b/irqlist.c index 7b4a436..b5d1401 100644 --- a/irqlist.c +++ b/irqlist.c @@ -252,10 +252,9 @@ void calculate_workload(void) irq = item->data; item = g_list_next(item); - irq->workload = irq->count - irq->old_count + irq->workload/3 + irq->extra; + irq->workload = irq->count - irq->old_count + irq->workload/3; class_counts[irq->class]++; irq->old_count = irq->count; - irq->extra = 0; } } @@ -267,9 +266,9 @@ void reset_counts(void) while (item) { irq = item->data; item = g_list_next(item); + set_irq_integer_prop(irq->number, IRQ_LAST_INT_COUNT, + find_irq_integer_prop(irq->number, IRQ_INT_COUNT)); irq->old_count = irq->count; - irq->extra = 0; - } } diff --git a/types.h b/types.h index 469c2f8..d24dde8 100644 --- a/types.h +++ b/types.h @@ -34,8 +34,10 @@ enum 
irq_prop { IRQ_CLASS = 0, IRQ_TYPE, IRQ_NUMA, + IRQ_LEVEL, IRQ_LCPU_MASK, IRQ_INT_COUNT, + IRQ_LAST_INT_COUNT, IRQ_MAX_PROPERTY }; @@ -104,7 +106,6 @@ struct interrupt { uint64_t count; uint64_t old_count; - uint64_t extra; cpumask_t mask; cpumask_t old_mask; From 57159ea2f89b4925f2ee2a5cb1dc045dd9835ee1 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 30 Sep 2011 15:28:34 -0400 Subject: [PATCH 09/44] Migrate to use of irq_info and remove struct interrupt Migrate core workload calculation code to use new irq_info struct and for_each_* helper functions --- activate.c | 44 +++-- classify.c | 235 ++++++++++--------------- cputree.c | 110 ++++++++---- irqbalance.c | 10 +- irqbalance.h | 23 ++- irqlist.c | 256 +++------------------------- numa.c | 62 +++---- placement.c | 435 ++++++++++++++++++++++++----------------------- procinterrupts.c | 16 +- types.h | 40 ++--- 10 files changed, 499 insertions(+), 732 deletions(-) diff --git a/activate.c b/activate.c index 68c142c..8c267a7 100644 --- a/activate.c +++ b/activate.c @@ -32,30 +32,28 @@ #include "irqbalance.h" -void activate_mapping(void) +static void activate_mapping(struct irq_info *info, void *data __attribute__((unused))) { - struct interrupt *irq; - GList *iter; + char buf[PATH_MAX]; + FILE *file; - iter = g_list_first(interrupts); - while (iter) { - irq = iter->data; - iter = g_list_next(iter); + if (info->level == BALANCE_NONE) + return; + if (cpus_equal(info->mask, info->old_mask)) + return; - /* don't set the level if it's a NONE irq, or if there is - * no change */ - if (irq->balance_level != BALANCE_NONE && - !cpus_equal(irq->mask, irq->old_mask)) { - char buf[PATH_MAX]; - FILE *file; - sprintf(buf, "/proc/irq/%i/smp_affinity", irq->number); - file = fopen(buf, "w"); - if (!file) - continue; - cpumask_scnprintf(buf, PATH_MAX, irq->mask); - fprintf(file,"%s", buf); - fclose(file); - irq->old_mask = irq->mask; - } - } + sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); + file = fopen(buf, "w"); + if 
(!file) + return; + + cpumask_scnprintf(buf, PATH_MAX, info->mask); + fprintf(file, "%s", buf); + fclose(file); + info->old_mask = info->mask; +} + +void activate_mappings(void) +{ + for_each_irq(NULL, activate_mapping, NULL); } diff --git a/classify.c b/classify.c index a8cb640..dc5f05b 100644 --- a/classify.c +++ b/classify.c @@ -56,38 +56,6 @@ static short class_codes[MAX_CLASS] = { static GList *interrupts_db; #define SYSDEV_DIR "/sys/bus/pci/devices" -union property { - int int_val; - cpumask_t mask_val; -}; - -enum irq_type { - INT_TYPE = 0, - CPUMASK_TYPE, -}; - -struct irq_property { - enum irq_type itype; - union property iproperty; -}; -#define iint_val iproperty.int_val -#define imask_val iproperty.mask_val - -struct irq_info { - int irq; - struct irq_property property[IRQ_MAX_PROPERTY]; -}; - -static void init_new_irq(struct irq_info *new) -{ - new->property[IRQ_CLASS].itype = INT_TYPE; - new->property[IRQ_TYPE].itype = INT_TYPE; - new->property[IRQ_NUMA].itype = INT_TYPE; - new->property[IRQ_LEVEL].itype = INT_TYPE; - new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE; - new->property[IRQ_INT_COUNT].itype = INT_TYPE; - new->property[IRQ_LAST_INT_COUNT].itype = INT_TYPE; -} static gint compare_ints(gconstpointer a, gconstpointer b) { @@ -129,13 +97,12 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq) return NULL; } - new = malloc(sizeof(struct irq_info)); + new = calloc(sizeof(struct irq_info), 1); if (!new) return NULL; - init_new_irq(new); new->irq = irq; - new->property[IRQ_CLASS].iint_val = IRQ_OTHER; + new->class = IRQ_OTHER; interrupts_db = g_list_append(interrupts_db, new); @@ -162,8 +129,8 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq) if (class >= MAX_CLASS) goto get_numa_node; - new->property[IRQ_CLASS].iint_val = class_codes[class]; - new->property[IRQ_LEVEL].iint_val = map_class_to_level[class_codes[class]]; + new->class = class_codes[class]; + new->level = 
map_class_to_level[class_codes[class]]; get_numa_node: numa_node = -1; @@ -176,23 +143,39 @@ get_numa_node: fclose(fd); assign_node: - new->property[IRQ_NUMA].iint_val = numa_node; + new->numa_node = get_numa_node(numa_node); sprintf(path, "%s/local_cpus", devpath); fd = fopen(path, "r"); if (!fd) { - cpus_setall(new->property[IRQ_LCPU_MASK].imask_val); - goto out; + cpus_setall(new->cpumask); + goto assign_affinity_hint; } lcpu_mask = NULL; rc = fscanf(fd, "%as", &lcpu_mask); fclose(fd); if (!lcpu_mask) { - cpus_setall(new->property[IRQ_LCPU_MASK].imask_val); + cpus_setall(new->cpumask); } else { cpumask_parse_user(lcpu_mask, strlen(lcpu_mask), - new->property[IRQ_LCPU_MASK].imask_val); + new->cpumask); } + free(lcpu_mask); + +assign_affinity_hint: + cpus_clear(new->affinity_hint); + sprintf(path, "/proc/irq/%d/affinity_hint", irq); + fd = fopen(path, "r"); + if (!fd) + goto out; + lcpu_mask = NULL; + rc = fscanf(fd, "%as", &lcpu_mask); + fclose(fd); + if (!lcpu_mask) + goto out; + cpumask_parse_user(lcpu_mask, strlen(lcpu_mask), + new->affinity_hint); + free(lcpu_mask); out: if (debug_mode) printf("Adding IRQ %d to database\n", irq); @@ -230,7 +213,7 @@ static void build_one_dev_entry(const char *dirname) new = add_one_irq_to_db(path, irqnum); if (!new) continue; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX; + new->type = IRQ_TYPE_MSIX; } } while (entry != NULL); closedir(msidir); @@ -252,8 +235,9 @@ static void build_one_dev_entry(const char *dirname) new = add_one_irq_to_db(path, irqnum); if (!new) goto done; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY; + new->type = IRQ_TYPE_LEGACY; } + done: fclose(fd); return; @@ -286,117 +270,82 @@ void rebuild_irq_db(void) closedir(devdir); } -static GList *add_misc_irq(int irq) +struct irq_info *add_misc_irq(int irq) { - struct irq_info *new, find; + struct irq_info *new; - new = malloc(sizeof(struct irq_info)); + new = calloc(sizeof(struct irq_info), 1); if (!new) return NULL; - init_new_irq(new); new->irq 
= irq; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY; - new->property[IRQ_CLASS].iint_val = IRQ_OTHER; - new->property[IRQ_NUMA].iint_val = -1; + new->type = IRQ_TYPE_LEGACY; + new->class = IRQ_OTHER; + new->numa_node = get_numa_node(0); interrupts_db = g_list_append(interrupts_db, new); - find.irq = irq; - return g_list_find_custom(interrupts_db, &find, compare_ints); + return new; } -int find_irq_integer_prop(int irq, enum irq_prop prop) +void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data) { - GList *entry; - struct irq_info find, *result; - - find.irq = irq; - - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - - if (!entry) { - if (debug_mode) - printf("No entry for irq %d in the irq database, adding default entry\n", irq); - entry = add_misc_irq(irq); - } - - result = entry->data; - assert(result->property[prop].itype == INT_TYPE); - return result->property[prop].iint_val; -} - -int set_irq_integer_prop(int irq, enum irq_prop prop, int val) -{ - GList *entry; - struct irq_info find, *result; - - find.irq = irq; - - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - - if (!entry) { - if (debug_mode) - printf("No entry for irq %d in the irq database, adding default entry\n", irq); - entry = add_misc_irq(irq); - } - - result = entry->data; - assert(result->property[prop].itype == INT_TYPE); - result->property[prop].iint_val = val; - return 0; -} - -cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop) -{ - GList *entry; - struct irq_info find, *result; - - find.irq = irq; - - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - - if (!entry) { - if (debug_mode) - printf("No entry for irq %d in the irq database, adding default entry\n", irq); - entry = add_misc_irq(irq); - } - - result = entry->data; - assert(result->property[prop].itype == CPUMASK_TYPE); - return result->property[prop].imask_val; -} - -int get_next_irq(int irq) -{ - GList *entry; - struct irq_info 
*irqp, find; - - if (irq == -1) { - entry = g_list_first(interrupts_db); - irqp = entry->data; - return irqp->irq; - } - - find.irq = irq; - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - if (!entry) - return -1; - - entry = g_list_next(entry); - if (!entry) - return -1; - irqp= entry->data; - return irqp->irq; -} - -void for_each_irq(void (*cb)(int irq)) -{ - struct irq_info *info; - GList *entry = g_list_first(interrupts_db); + GList *entry = g_list_first(list ? list : interrupts_db); + GList *next; while (entry) { - info = entry->data; - cb(info->irq); - entry = g_list_next(entry); + next = g_list_next(entry); + cb(entry->data, data); + entry = next; } } +struct irq_info *get_irq_info(int irq) +{ + GList *entry; + struct irq_info find; + + find.irq = irq; + entry = g_list_find_custom(interrupts_db, &find, compare_ints); + return entry ? entry->data : NULL; +} + +void migrate_irq(GList **from, GList **to, struct irq_info *info) +{ + GList *entry; + struct irq_info find, *tmp;; + + if (from != NULL) { + find.irq = info->irq; + entry = g_list_find_custom(*from, &find, compare_ints); + tmp = entry->data; + *from = g_list_delete_link(*from, entry); + } else + tmp = info; + + + *to = g_list_append(*to, tmp); +} + +static gint sort_irqs(gconstpointer A, gconstpointer B) +{ + struct irq_info *a, *b; + a = (struct irq_info*)A; + b = (struct irq_info*)B; + + if (a->class < b->class) + return 1; + if (a->class > b->class) + return -1; + if (a->workload < b->workload) + return 1; + if (a->workload > b->workload) + return -1; + if (adata; - printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload); - dump_interrupts = g_list_next(dump_interrupts); - } + int spaces = (long int)data; + int i; + for (i=0; iirq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->workload); +} + +static void dump_cpu_core(struct cpu_core *c, void *data __attribute__((unused))) +{ + printf(" 
CPU number %i numa_node is %d (workload %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->workload); + if (c->interrupts) + for_each_irq(c->interrupts, dump_irq, (void *)18); +} + +static void dump_cache_domain(struct cache_domain *c, void *data) +{ + char *buffer = data; + cpumask_scnprintf(buffer, 4095, c->mask); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->number, cache_domain_numa_node(c)->number, buffer, (unsigned long)c->workload); + if (c->cpu_cores) + for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL); + if (c->interrupts) + for_each_irq(c->interrupts, dump_irq, (void *)10); +} + +static void dump_package(struct package *p, void *data) +{ + char *buffer = data; + cpumask_scnprintf(buffer, 4096, p->mask); + printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->number, package_numa_node(p)->number, buffer, (unsigned long)p->workload); + if (p->cache_domains) + for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer); + if (p->interrupts) + for_each_irq(p->interrupts, dump_irq, (void *)2); } void dump_tree(void) { - GList *p_iter, *c_iter, *cp_iter; - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - char buffer[4096]; - p_iter = g_list_first(packages); - while (p_iter) { - package = p_iter->data; - cpumask_scnprintf(buffer, 4096, package->mask); - printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package_numa_node(package)->number, buffer, (unsigned long)package->workload); - c_iter = g_list_first(package->cache_domains); - while (c_iter) { - cache_domain = c_iter->data; - c_iter = g_list_next(c_iter); - cpumask_scnprintf(buffer, 4095, cache_domain->mask); - printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload); - cp_iter = cache_domain->cpu_cores; - while (cp_iter) { - cpu = cp_iter->data; 
- cp_iter = g_list_next(cp_iter); - printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu_numa_node(cpu)->number , (unsigned long)cpu->workload); - dump_irqs(18, cpu->interrupts); - } - dump_irqs(10, cache_domain->interrupts); - } - dump_irqs(2, package->interrupts); - p_iter = g_list_next(p_iter); - } + for_each_package(NULL, dump_package, buffer); } /* @@ -444,3 +442,41 @@ void clear_cpu_tree(void) core_count = 0; } + + +void for_each_package(GList *list, void (*cb)(struct package *p, void *data), void *data) +{ + GList *entry = g_list_first(list ? list : packages); + GList *next; + + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + +void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c, void *data), void *data) +{ + GList *entry = g_list_first(list ? list : cache_domains); + GList *next; + + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + +void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), void *data) +{ + GList *entry = g_list_first(list ? 
list : cpus); + GList *next; + + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + diff --git a/irqbalance.c b/irqbalance.c index f341573..4d76054 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -124,7 +124,7 @@ static void free_object_tree() static void dump_object_tree() { - for_each_numa_node(dump_numa_node_info); + for_each_numa_node(NULL, dump_numa_node_info, NULL); } int main(int argc, char** argv) @@ -185,7 +185,6 @@ int main(int argc, char** argv) sleep(SLEEP_INTERVAL/4); reset_counts(); parse_proc_interrupts(); - pci_numa_scan(); calculate_workload(); sort_irq_list(); if (debug_mode) @@ -216,13 +215,8 @@ int main(int argc, char** argv) calculate_workload(); - /* to cope with dynamic configurations we scan for new numa information - * once every 5 minutes - */ - pci_numa_scan(); - calculate_placement(); - activate_mapping(); + activate_mappings(); if (debug_mode) dump_tree(); diff --git a/irqbalance.h b/irqbalance.h index d675ecb..39fa5d2 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -32,7 +32,6 @@ extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); -extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); void calculate_workload(void); void reset_counts(void); @@ -41,7 +40,7 @@ void sort_irq_list(void); void calculate_placement(void); void dump_tree(void); -void activate_mapping(void); +void activate_mappings(void); void account_for_nic_stats(void); void check_power_mode(void); void clear_cpu_tree(void); @@ -53,19 +52,23 @@ void pci_numa_scan(void); */ extern void build_numa_node_list(void); extern void free_numa_node_list(void); -extern void dump_numa_node_info(struct numa_node *node); -extern void for_each_numa_node(void (*cb)(struct numa_node *node)); +extern void dump_numa_node_info(struct numa_node *node, void *data); +extern void for_each_numa_node(GList *list, void (*cb)(struct 
numa_node *node, void *data), void *data); extern void add_package_to_node(struct package *p, int nodeid); +extern struct numa_node *get_numa_node(int nodeid); /* * Package functions */ #define package_numa_node(p) ((p)->numa_node) +extern void for_each_package(GList *list, void (*cb)(struct package *p, void *data), void *data); + /* * cache_domain functions */ #define cache_domain_package(c) ((c)->package) #define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) +extern void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c, void *data), void *data); /* * cpu core functions @@ -73,15 +76,19 @@ extern void add_package_to_node(struct package *p, int nodeid); #define cpu_cache_domain(cpu) ((cpu)->cache_domain) #define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) +extern void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), void *data); /* * irq db functions */ extern void rebuild_irq_db(void); extern void free_irq_db(void); -extern int set_irq_integer_prop(int irq, enum irq_prop prop, int val); -extern int find_irq_integer_prop(int irq, enum irq_prop prop); -extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop); -extern void for_each_irq(void (*cb)(int irq)); +extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data); +extern struct irq_info *get_irq_info(int irq); +extern void migrate_irq(GList **from, GList **to, struct irq_info *info); +extern struct irq_info *add_misc_irq(int irq); + +#define irq_numa_node(irq) ((irq)->numa_node) + #endif diff --git a/irqlist.c b/irqlist.c index b5d1401..9a8bffd 100644 --- a/irqlist.c +++ b/irqlist.c @@ -33,11 +33,9 @@ #include "types.h" #include "irqbalance.h" -GList *interrupts; - -void get_affinity_hint(struct interrupt *irq, int number) +void get_affinity_hint(struct irq_info *irq, int number) 
{ char buf[PATH_MAX]; cpumask_t tempmask; @@ -55,260 +53,46 @@ void get_affinity_hint(struct interrupt *irq, int number) } cpumask_parse_user(line, strlen(line), tempmask); if (!__cpus_full(&tempmask, num_possible_cpus())) - irq->node_mask = tempmask; + irq->affinity_hint = tempmask; fclose(file); free(line); } -/* - * This function classifies and reads various things from /proc about a specific irq - */ -static void investigate(struct interrupt *irq, int number) +void build_workload(struct irq_info *info, void *unused __attribute__((unused))) { - DIR *dir; - struct dirent *entry; - char *c, *c2; - int nr , count = 0, can_set = 1; - char buf[PATH_MAX]; - sprintf(buf, "/proc/irq/%i", number); - dir = opendir(buf); - do { - entry = readdir(dir); - if (!entry) - break; - if (strcmp(entry->d_name,"smp_affinity")==0) { - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/smp_affinity", number); - file = fopen(buf, "r+"); - if (!file) - continue; - if (getline(&line, &size, file)==0) { - free(line); - fclose(file); - continue; - } - cpumask_parse_user(line, strlen(line), irq->mask); - /* - * Check that we can write the affinity, if - * not take it out of the list. 
- */ - fputs(line, file); - if (fclose(file) && errno == EIO) - can_set = 0; - free(line); - } else if (strcmp(entry->d_name,"allowed_affinity")==0) { - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/allowed_affinity", number); - file = fopen(buf, "r"); - if (!file) - continue; - if (getline(&line, &size, file)==0) { - free(line); - fclose(file); - continue; - } - cpumask_parse_user(line, strlen(line), irq->allowed_mask); - fclose(file); - free(line); - } else if (strcmp(entry->d_name,"affinity_hint")==0) { - get_affinity_hint(irq, number); - } else { - irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS); - } - - } while (entry); - closedir(dir); - irq->balance_level = map_class_to_level[irq->class]; - - for (nr = 0; nr < NR_CPUS; nr++) - if (cpu_isset(nr, irq->allowed_mask)) - count++; - - /* if there is no choice in the allowed mask, don't bother to balance */ - if ((count<2) || (can_set == 0)) - irq->balance_level = BALANCE_NONE; - - - /* next, check the IRQBALANCE_BANNED_INTERRUPTS env variable for blacklisted irqs */ - c = c2 = getenv("IRQBALANCE_BANNED_INTERRUPTS"); - if (!c) - return; - - do { - c = c2; - nr = strtoul(c, &c2, 10); - if (c!=c2 && nr == number) - irq->balance_level = BALANCE_NONE; - } while (c!=c2 && c2!=NULL); -} - -/* Set numa node number for MSI interrupt; - * Assumes existing irq metadata - */ -void set_msi_interrupt_numa(int number) -{ - GList *item; - struct interrupt *irq; - int node; - - node = find_irq_integer_prop(number, IRQ_NUMA); - if (node < 0) - return; - - item = g_list_first(interrupts); - while (item) { - irq = item->data; - - if (irq->number == number) { - irq->node_num = node; - irq->msi = 1; - return; - } - item = g_list_next(item); - } -} - -/* - * Set the number of interrupts received for a specific irq; - * create the irq metadata if there is none yet - */ -void set_interrupt_count(int number, uint64_t count) -{ - GList *item; - struct interrupt *irq; - - if (count < 
MIN_IRQ_COUNT && !one_shot_mode) - return; /* no need to track or set interrupts sources without any activity since boot - but allow for a few (20) boot-time-only interrupts */ - - item = g_list_first(interrupts); - while (item) { - irq = item->data; - - if (irq->number == number) { - irq->count = count; - set_irq_integer_prop(number, IRQ_INT_COUNT, count); - /* see if affinity_hint changed */ - get_affinity_hint(irq, number); - return; - } - item = g_list_next(item); - } - /* new interrupt */ - irq = malloc(sizeof(struct interrupt)); - if (!irq) - return; - memset(irq, 0, sizeof(struct interrupt)); - irq->node_num = -1; - irq->number = number; - irq->count = count; - irq->allowed_mask = CPU_MASK_ALL; - investigate(irq, number); - set_irq_integer_prop(number, IRQ_INT_COUNT, count); - interrupts = g_list_append(interrupts, irq); -} - -/* - * Set the numa affinity mask for a specific interrupt if there - * is metadata for the interrupt; do nothing if no such data - * exists. - */ -void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type) -{ - GList *item; - struct interrupt *irq; - - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - - if (irq->number == number) { - cpus_or(irq->numa_mask, irq->numa_mask, mask); - irq->node_num = node_num; - if (irq->class < type && irq->balance_level != BALANCE_NONE) { - irq->class = type; - irq->balance_level = map_class_to_level[irq->class]; - } - return; - } - } + info->workload = info->irq_count - info->last_irq_count + info->workload/3; + class_counts[info->class]++; + info->last_irq_count = info->irq_count; } void calculate_workload(void) { int i; - GList *item; - struct interrupt *irq; for (i=0; i<7; i++) class_counts[i]=0; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); + for_each_irq(NULL, build_workload, NULL); +} - irq->workload = irq->count - irq->old_count + irq->workload/3; - 
class_counts[irq->class]++; - irq->old_count = irq->count; - } +static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused))) +{ + info->last_irq_count = info->irq_count; + info->irq_count = 0; } void reset_counts(void) { - GList *item; - struct interrupt *irq; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - set_irq_integer_prop(irq->number, IRQ_LAST_INT_COUNT, - find_irq_integer_prop(irq->number, IRQ_INT_COUNT)); - irq->old_count = irq->count; - } + for_each_irq(NULL, reset_irq_count, NULL); +} + + +static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) +{ + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->workload); } void dump_workloads(void) { - GList *item; - struct interrupt *irq; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - - printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload); - - } + for_each_irq(NULL, dump_workload, NULL); } - -static gint sort_irqs(gconstpointer A, gconstpointer B) -{ - struct interrupt *a, *b; - a = (struct interrupt*)A; - b = (struct interrupt*)B; - - if (a->class < b->class) - return 1; - if (a->class > b->class) - return -1; - if (a->workload < b->workload) - return 1; - if (a->workload > b->workload) - return -1; - if (alow) and then by workload (high->low) */ - interrupts = g_list_sort(interrupts, sort_irqs); -} diff --git a/numa.c b/numa.c index 9316dd2..19817b1 100644 --- a/numa.c +++ b/numa.c @@ -37,26 +37,13 @@ GList *numa_nodes = NULL; -static void set_irq_numa(int irq) -{ - cpumask_t mask; - int node_num; - int type; - - mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK); - - node_num = find_irq_integer_prop(irq, IRQ_NUMA); - - type = find_irq_integer_prop(irq, 
IRQ_CLASS); - - add_interrupt_numa(irq, mask, node_num, type); - -} - -void pci_numa_scan(void) -{ - for_each_irq(set_irq_numa); -} +struct numa_node unspecified_node = { + .workload = 0, + .number = -1, + .mask = CPU_MASK_ALL, + .packages = NULL, + .interrupts = NULL, +}; static void add_one_node(const char *nodename) { @@ -73,13 +60,13 @@ static void add_one_node(const char *nodename) sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); f = fopen(path, "r"); if (ferror(f)) { - cpus_clear(new->local_cpus); + cpus_clear(new->mask); } else { fscanf(f, "%as", &cpustr); if (!cpustr) { - cpus_clear(new->local_cpus); + cpus_clear(new->mask); } else { - cpumask_parse_user(cpustr, strlen(cpustr), new->local_cpus); + cpumask_parse_user(cpustr, strlen(cpustr), new->mask); free(cpustr); } } @@ -143,25 +130,40 @@ void add_package_to_node(struct package *p, int nodeid) p->numa_node = node; } -void dump_numa_node_info(struct numa_node *node) +void dump_numa_node_info(struct numa_node *node, void *unused __attribute__((unused))) { char buffer[4096]; printf("NUMA NODE NUMBER: %d\n", node->number); - cpumask_scnprintf(buffer, 4096, node->local_cpus); + cpumask_scnprintf(buffer, 4096, node->mask); printf("LOCAL CPU MASK: %s\n", buffer); printf("\n"); } -void for_each_numa_node(void(*cb)(struct numa_node *node)) +void for_each_numa_node(GList *list, void(*cb)(struct numa_node *node, void *data), void *data) { - GList *entry; + GList *entry, *next; - entry = g_list_first(numa_nodes); + entry = g_list_first(list ? list : numa_nodes); while (entry) { - cb(entry->data); - entry = g_list_next(entry); + next = g_list_next(entry); + cb(entry->data, data); + entry = next; } } +struct numa_node *get_numa_node(int nodeid) +{ + struct numa_node find; + GList *entry; + + if (nodeid == -1) + return &unspecified_node; + + find.number = nodeid; + + entry = g_list_find_custom(numa_nodes, &find, compare_node); + return entry ? 
entry->data : NULL; +} + diff --git a/placement.c b/placement.c index 6bc2411..3f969b5 100644 --- a/placement.c +++ b/placement.c @@ -30,9 +30,9 @@ int power_mode; -extern GList *interrupts, *packages, *cache_domains, *cpus; +extern GList *packages, *cache_domains, *cpus; -static uint64_t package_cost_func(struct interrupt *irq, struct package *package) +static uint64_t package_cost_func(struct irq_info *irq, struct package *package) { int bonus = 0; int maxcount; @@ -42,9 +42,9 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package bonus = CROSS_PACKAGE_PENALTY; /* do a little numa affinity */ - if (irq->node_num != package_numa_node(package)->number) { - if (irq->node_num >= 0 && package_numa_node(package)->number >= 0) { - dist = numa_distance(irq->node_num, package_numa_node(package)->number); + if (irq_numa_node(irq)->number != package_numa_node(package)->number) { + if (irq_numa_node(irq)->number >= 0 && package_numa_node(package)->number >= 0) { + dist = numa_distance(irq_numa_node(irq)->number, package_numa_node(package)->number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -63,14 +63,10 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package if (package->class_count[irq->class]>=maxcount && !power_mode) bonus += 300000; - /* if the package has no cpus in the allowed mask.. 
just block */ - if (!cpus_intersects(irq->allowed_mask, package->mask)) - bonus += 600000; - return irq->workload + bonus; } -static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain) +static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain *cache_domain) { int bonus = 0; int dist; @@ -80,9 +76,9 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai bonus = CROSS_PACKAGE_PENALTY/2; /* do a little numa affinity */ - if (irq->node_num != cache_domain->node_num) { - if (irq->node_num >= 0 && cache_domain->node_num >= 0) { - dist = numa_distance(irq->node_num, cache_domain->node_num); + if (irq_numa_node(irq)->number != cache_domain->node_num) { + if (irq_numa_node(irq)->number >= 0 && cache_domain->node_num >= 0) { + dist = numa_distance(irq_numa_node(irq)->number, cache_domain->node_num); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -98,17 +94,14 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai /* try to avoid having a lot of MSI interrupt (globally, no by devide id) on * cache domain */ - if (irq->msi == 1) + if ((irq->type == IRQ_TYPE_MSI) || (irq->type == IRQ_TYPE_MSIX)) bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class]; - /* if the cache domain has no cpus in the allowed mask.. 
just block */ - if (!cpus_intersects(irq->allowed_mask, cache_domain->mask)) - bonus += 600000; return irq->workload + bonus; } -static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) +static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) { int bonus = 0; int dist; @@ -118,9 +111,9 @@ static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) bonus = CROSS_PACKAGE_PENALTY/3; /* do a little numa affinity */ - if (irq->node_num != cpu_numa_node(cpu)->number) { - if (irq->node_num >= 0 && cpu_numa_node(cpu)->number >= 0) { - dist = numa_distance(irq->node_num, cpu_numa_node(cpu)->number); + if (irq_numa_node(irq)->number != cpu_numa_node(cpu)->number) { + if (irq_numa_node(irq)->number >= 0 && cpu_numa_node(cpu)->number >= 0) { + dist = numa_distance(irq_numa_node(irq)->number, cpu_numa_node(cpu)->number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -140,245 +133,259 @@ static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) /* pay 6000 for each previous interrupt of the same class */ bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class]; - /* if the core has no cpus in the allowed mask.. 
just block */ - if (!cpus_intersects(irq->allowed_mask, cpu->mask)) - bonus += 600000; - return irq->workload + bonus; } +struct cache_domain_placement { + struct irq_info *info; + struct cache_domain *best; + uint64_t best_cost; +}; -static void place_cache_domain(struct package *package) +static void find_best_cd(struct cache_domain *c, void *data) { - GList *iter, *next; - GList *pkg; - struct interrupt *irq; - struct cache_domain *cache_domain; + struct cache_domain_placement *best = data; + uint64_t newload; - - iter = g_list_first(package->interrupts); - while (iter) { - struct cache_domain *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; - - if (irq->balance_level <= BALANCE_PACKAGE) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(package->cache_domains); - while (pkg) { - uint64_t newload; - - cache_domain = pkg->data; - newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain); - if (newload < best_cost) { - best = cache_domain; - best_cost = newload; - } - - pkg = g_list_next(pkg); - } - if (best) { - next = g_list_next(iter); - package->interrupts = g_list_delete_link(package->interrupts, iter); - - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - iter = next; - } else - iter = g_list_next(iter); + newload = c->workload + cache_domain_cost_func(best->info, c); + if (newload < best->best_cost) { + best->best = c; + best->best_cost = newload; } +} + +static void place_irq_in_cache_domain(struct irq_info *info, void *data) +{ + struct package *p = data; + struct cache_domain_placement place; + + if (info->level <= BALANCE_PACKAGE) + return; + + place.best_cost = INT_MAX; + place.best = NULL; + place.info = info; + + for_each_cache_domain(p->cache_domains, find_best_cd, &place); + + if (place.best) { + migrate_irq(&p->interrupts, &place.best->interrupts, info); + info->assigned_obj = place.best; + 
place.best->class_count[info->class]++; + info->mask = place.best->mask; + } + +} + +static void place_cache_domain(struct package *package, void *data __attribute__((unused))) +{ + if (package->interrupts) + for_each_irq(package->interrupts, place_irq_in_cache_domain, package); } -static void place_core(struct cache_domain *cache_domain) +struct core_placement { + struct cpu_core *best; + uint64_t best_cost; + struct irq_info *info; +}; + +static void place_irq_in_core(struct cpu_core *c, void *data) { - GList *iter, *next; - GList *pkg; - struct interrupt *irq; - struct cpu_core *cpu; + struct core_placement *best = data; + uint64_t newload; - - iter = g_list_first(cache_domain->interrupts); - while (iter) { - struct cpu_core *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; - - /* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */ - if (irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(cache_domain->cpu_cores); - while (pkg) { - uint64_t newload; - - cpu = pkg->data; - newload = cpu->workload + cpu_cost_func(irq, cpu); - if (newload < best_cost) { - best = cpu; - best_cost = newload; - } - - pkg = g_list_next(pkg); - } - if (best) { - next = g_list_next(iter); - cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter); - - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - iter = next; - } else - iter = g_list_next(iter); + newload = c->workload + cpu_cost_func(best->info, c); + if (newload < best->best_cost) { + best->best = c; + best->best_cost = newload; } } - -static void place_packages(GList *list) +static void place_core(struct irq_info *info, void *data) { - GList *iter; - GList *pkg; - struct interrupt *irq; - struct package *package; + struct cache_domain *c = 
data; + struct core_placement place; + if ((info->level <= BALANCE_CACHE) && + (!one_shot_mode)) + return; - iter = g_list_first(list); - while (iter) { - struct package *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; - if (irq->balance_level == BALANCE_NONE) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(packages); - while (pkg) { - uint64_t newload; + place.info = info; + place.best = NULL; + place.best_cost = INT_MAX; - package = pkg->data; - newload = package->workload + package_cost_func(irq, package); - if (newload < best_cost) { - best = package; - best_cost = newload; - } + for_each_cpu_core(c->cpu_cores, place_irq_in_core, &place); - pkg = g_list_next(pkg); - } - if (best) { - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - } - iter = g_list_next(iter); + if (place.best) { + migrate_irq(&c->interrupts, &place.best->interrupts, info); + info->assigned_obj = place.best; + place.best->workload += info->workload + 1; + info->mask = place.best->mask; + } + +} + +static void place_cores(struct cache_domain *cache_domain, void *data __attribute__((unused))) +{ + if (cache_domain->interrupts) + for_each_irq(cache_domain->interrupts, place_core, cache_domain); +} + +struct package_placement { + struct irq_info *info; + struct package *best; + uint64_t best_cost; +}; + +static void find_best_package(struct package *p, void *data) +{ + uint64_t newload; + struct package_placement *place = data; + + newload = p->workload + package_cost_func(place->info, p); + if (newload < place->best_cost) { + place->best = p; + place->best_cost = newload; } } - -static void place_affinity_hint(GList *list) +static void place_irq_in_package(struct irq_info *info, void *unused __attribute__((unused))) { - /* still need to balance best workload within the affinity_hint mask */ - GList *iter; - struct interrupt *irq; + struct 
package_placement place; - iter = g_list_first(list); - while (iter) { - irq = iter->data; - if (irq->balance_level == BALANCE_NONE) { - iter = g_list_next(iter); - continue; - } - if ((!cpus_empty(irq->node_mask)) && - (!cpus_equal(irq->mask, irq->node_mask)) && - (!__cpus_full(&irq->node_mask, num_possible_cpus()))) { - irq->old_mask = irq->mask; - irq->mask = irq->node_mask; - } + if (info->level == BALANCE_NONE) + return; - iter = g_list_next(iter); + place.best_cost = INT_MAX; + place.best = NULL; + place.info = info; + + for_each_package(NULL, find_best_package, &place); + + if (place.best) { + migrate_irq(NULL, &place.best->interrupts, info); + info->assigned_obj = place.best; + place.best->workload += info->workload + 1; + place.best->class_count[info->class]++; + info->mask = place.best->mask; } } +static void place_irq_affinity_hint(struct irq_info *info, void *data __attribute__((unused))) +{ + + if (info->level == BALANCE_NONE) + return; + + if ((!cpus_empty(irq_numa_node(info)->mask)) && + (!cpus_equal(info->mask, irq_numa_node(info)->mask)) && + (!__cpus_full(&irq_numa_node(info)->mask, num_possible_cpus()))) { + info->old_mask = info->mask; + info->mask = irq_numa_node(info)->mask; + } +} + +static void place_affinity_hint(void) +{ + for_each_irq(NULL, place_irq_affinity_hint, NULL); +} + + +static void check_cpu_irq_route(struct cpu_core *c, void *data) +{ + struct irq_info *info = data; + + if (cpus_intersects(c->mask, irq_numa_node(info)->mask) || + cpus_intersects(c->mask, info->mask)) + c->workload += info->workload; +} + +static void check_cd_irq_route(struct cache_domain *c, void *data) +{ + struct irq_info *info = data; + + if (cpus_intersects(c->mask, irq_numa_node(info)->mask) || + cpus_intersects(c->mask, info->mask)) + c->workload += info->workload; +} + +static void check_package_irq_route(struct package *p, void *data) +{ + struct irq_info *info = data; + + if (cpus_intersects(p->mask, irq_numa_node(info)->mask) || + 
cpus_intersects(p->mask, info->mask)) + p->workload += info->workload; +} + +static void check_irq_route(struct irq_info *info, void *data __attribute__((unused))) +{ + + if (info->level != BALANCE_NONE) + return; + + for_each_package(NULL, check_package_irq_route, info); + for_each_cache_domain(NULL, check_cd_irq_route, info); + for_each_cpu_core(NULL, check_cpu_irq_route, info); +} static void do_unroutables(void) { - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - struct interrupt *irq; - GList *iter, *inter; - - inter = g_list_first(interrupts); - while (inter) { - irq = inter->data; - inter = g_list_next(inter); - if (irq->balance_level != BALANCE_NONE) - continue; - - iter = g_list_first(packages); - while (iter) { - package = iter->data; - if (cpus_intersects(package->mask, irq->node_mask) || - cpus_intersects(package->mask, irq->mask)) - package->workload += irq->workload; - iter = g_list_next(iter); - } - - iter = g_list_first(cache_domains); - while (iter) { - cache_domain = iter->data; - if (cpus_intersects(cache_domain->mask, irq->node_mask) - || cpus_intersects(cache_domain->mask, irq->mask)) - cache_domain->workload += irq->workload; - iter = g_list_next(iter); - } - iter = g_list_first(cpus); - while (iter) { - cpu = iter->data; - if (cpus_intersects(cpu->mask, irq->node_mask) || - cpus_intersects(cpu->mask, irq->mask)) - cpu->workload += irq->workload; - iter = g_list_next(iter); - } - } + for_each_irq(NULL, check_irq_route, NULL); } +static void validate_irq(struct irq_info *info, void *data) +{ + printf("Validating irq %d %p against %p\n", info->irq, info->assigned_obj, data); + if (info->assigned_obj != data) + printf("irq %d is wrong, points to %p, should be %p\n", + info->irq, info->assigned_obj, data); +} + +static void validate_package(struct package *p, void *data __attribute__((unused))) +{ + if (p->interrupts) + for_each_irq(p->interrupts, validate_irq, p); +} + +static void validate_cd(struct 
cache_domain *c, void *data __attribute__((unused))) +{ + if (c->interrupts) + for_each_irq(c->interrupts, validate_irq, c); +} + +static void validate_cpu(struct cpu_core *c, void *data __attribute__((unused))) +{ + if (c->interrupts) + for_each_irq(c->interrupts, validate_irq, c); +} + +static void validate_object_tree_placement() +{ + for_each_package(NULL, validate_package, NULL); + for_each_cache_domain(NULL, validate_cd, NULL); + for_each_cpu_core(NULL, validate_cpu, NULL); +} void calculate_placement(void) { - struct package *package; - struct cache_domain *cache_domain; - GList *iter; /* first clear old data */ clear_work_stats(); + sort_irq_list(); do_unroutables(); - place_packages(interrupts); - iter = g_list_first(packages); - while (iter) { - package = iter->data; - place_cache_domain(package); - iter = g_list_next(iter); - } + for_each_irq(NULL, place_irq_in_package, NULL); + for_each_package(NULL, place_cache_domain, NULL); + for_each_cache_domain(NULL, place_cores, NULL); - iter = g_list_first(cache_domains); - while (iter) { - cache_domain = iter->data; - place_core(cache_domain); - iter = g_list_next(iter); - } /* * if affinity_hint is populated on irq and is not set to * all CPUs (meaning it's initialized), honor that above * anything in the package locality/workload. 
*/ - place_affinity_hint(interrupts); + place_affinity_hint(); + if (debug_mode) + validate_object_tree_placement(); } diff --git a/procinterrupts.c b/procinterrupts.c index cd76903..094a7e1 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -39,7 +39,6 @@ void parse_proc_interrupts(void) FILE *file; char *line = NULL; size_t size = 0; - int int_type; file = fopen("/proc/interrupts", "r"); if (!file) @@ -56,6 +55,7 @@ void parse_proc_interrupts(void) int number; uint64_t count; char *c, *c2; + struct irq_info *info; if (getline(&line, &size, file)==0) break; @@ -73,6 +73,10 @@ void parse_proc_interrupts(void) *c = 0; c++; number = strtoul(line, NULL, 10); + info = get_irq_info(number); + if (!info) + info = add_misc_irq(number); + count = 0; cpunr = 0; @@ -89,17 +93,11 @@ void parse_proc_interrupts(void) if (cpunr != core_count) need_cpu_rescan = 1; - set_interrupt_count(number, count); + info->irq_count = count; /* is interrupt MSI based? */ - int_type = find_irq_integer_prop(number, IRQ_TYPE); - if ((int_type == IRQ_TYPE_MSI) || (int_type == IRQ_TYPE_MSIX)) { + if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX)) msi_found_in_sysfs = 1; - /* Set numa node for irq if it was MSI */ - if (debug_mode) - printf("Set MSI interrupt for %d\n", number); - set_msi_interrupt_numa(number); - } } if ((proc_int_has_msi) && (!msi_found_in_sysfs)) { syslog(LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n"); diff --git a/types.h b/types.h index d24dde8..fe7cf12 100644 --- a/types.h +++ b/types.h @@ -44,7 +44,7 @@ enum irq_prop { struct numa_node { uint64_t workload; int number; - cpumask_t local_cpus; + cpumask_t mask; GList *packages; GList *interrupts; }; @@ -94,29 +94,21 @@ struct cpu_core { GList *interrupts; }; -struct interrupt { - uint64_t workload; - - int balance_level; - - int number; - int class; - int node_num; - int msi; - - uint64_t count; - uint64_t old_count; - - cpumask_t mask; - cpumask_t old_mask; - - - cpumask_t numa_mask; - 
cpumask_t allowed_mask; - - /* user/driver provided for smarter balancing */ - cpumask_t node_mask; +struct irq_info { + int irq; + int class; + int type; + int level; + struct numa_node *numa_node; + cpumask_t cpumask; + cpumask_t affinity_hint; + cpumask_t mask; /*this will go away soon*/ + cpumask_t old_mask; /*this will go away soon*/ + uint64_t irq_count; + uint64_t last_irq_count; + uint64_t workload; + int moved; + void *assigned_obj; }; - #endif From c80a1db73ef3e5515fefdb5ec55d8926f471e3a1 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 13:38:35 -0400 Subject: [PATCH 10/44] Remove unneeded enum irq_prop --- types.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/types.h b/types.h index fe7cf12..bde3dd0 100644 --- a/types.h +++ b/types.h @@ -30,17 +30,6 @@ /* * IRQ properties */ -enum irq_prop { - IRQ_CLASS = 0, - IRQ_TYPE, - IRQ_NUMA, - IRQ_LEVEL, - IRQ_LCPU_MASK, - IRQ_INT_COUNT, - IRQ_LAST_INT_COUNT, - IRQ_MAX_PROPERTY -}; - struct numa_node { uint64_t workload; int number; From e53524aa9ab54ad8fb2f40607b992ceaba03d11f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 14:08:25 -0400 Subject: [PATCH 11/44] Merge common fields of objects to a single struct Numa nodes, pacakges, cache domains and cores have lots of common fields Merge those and place the common struct at the head of each field so code may have the opportunity to tread each object as a generic type. 
--- cputree.c | 107 ++++++++++++++++++---------------------------------- irqlist.c | 2 +- numa.c | 30 ++++++++------- placement.c | 96 +++++++++++++++++++++++----------------------- types.h | 43 ++++++--------------- 5 files changed, 113 insertions(+), 165 deletions(-) diff --git a/cputree.c b/cputree.c index 4986814..7d99eb1 100644 --- a/cputree.c +++ b/cputree.c @@ -55,39 +55,6 @@ cpumask_t cpu_possible_map; */ static cpumask_t unbanned_cpus; -static int search_numa_node(cpumask_t mask) -{ - int node_num, ret; - struct bitmask *node_mask; - cpumask_t cpu_node_mask; - - node_num = numa_num_configured_nodes(); - - if (node_num < 1) - return -1; - - node_mask = numa_allocate_cpumask(); - - node_num--; /* indexing from zero */ - - while (node_num >= 0) { - ret = numa_node_to_cpus(node_num, node_mask); - if (ret) { - node_num--; - continue; - } - memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long)); - if (cpus_intersects(mask, cpu_node_mask)) { - numa_free_cpumask(node_mask); - return node_num; - } - node_num--; - } - - numa_free_cpumask(node_mask); - return node_num; -} - static struct package* add_cache_domain_to_package(struct cache_domain *cache, cpumask_t package_mask) { @@ -99,7 +66,7 @@ static struct package* add_cache_domain_to_package(struct cache_domain *cache, while (entry) { package = entry->data; - if (cpus_equal(package_mask, package->mask)) + if (cpus_equal(package_mask, package->common.mask)) break; entry = g_list_next(entry); } @@ -108,7 +75,7 @@ static struct package* add_cache_domain_to_package(struct cache_domain *cache, package = calloc(sizeof(struct package), 1); if (!package) return NULL; - package->mask = package_mask; + package->common.mask = package_mask; packages = g_list_append(packages, package); package_count++; } @@ -139,7 +106,7 @@ static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, while (entry) { cache = entry->data; - if (cpus_equal(cache_mask, cache->mask)) + if 
(cpus_equal(cache_mask, cache->common.mask)) break; entry = g_list_next(entry); } @@ -148,7 +115,7 @@ static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, cache = calloc(sizeof(struct cache_domain), 1); if (!cache) return NULL; - cache->mask = cache_mask; + cache->common.mask = cache_mask; cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; } @@ -202,14 +169,14 @@ static void do_one_cpu(char *path) return; memset(cpu, 0, sizeof(struct cpu_core)); - cpu->number = strtoul(&path[27], NULL, 10); + cpu->common.number = strtoul(&path[27], NULL, 10); - cpu_set(cpu->number, cpu_possible_map); + cpu_set(cpu->common.number, cpu_possible_map); - cpu_set(cpu->number, cpu->mask); + cpu_set(cpu->common.number, cpu->common.mask); /* if the cpu is on the banned list, just don't add it */ - if (cpus_intersects(cpu->mask, banned_cpus)) { + if (cpus_intersects(cpu->common.mask, banned_cpus)) { free(cpu); /* even though we don't use the cpu we do need to count it */ core_count++; @@ -220,7 +187,7 @@ static void do_one_cpu(char *path) /* try to read the package mask; if it doesn't exist assume solitary */ snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path); file = fopen(new_path, "r"); - cpu_set(cpu->number, package_mask); + cpu_set(cpu->common.number, package_mask); if (file) { char *line = NULL; size_t size = 0; @@ -232,7 +199,7 @@ static void do_one_cpu(char *path) /* try to read the cache mask; if it doesn't exist assume solitary */ /* We want the deepest cache level available so try index1 first, then index2 */ - cpu_set(cpu->number, cache_mask); + cpu_set(cpu->common.number, cache_mask); snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path); file = fopen(new_path, "r"); if (file) { @@ -275,9 +242,9 @@ static void do_one_cpu(char *path) blank out the banned cpus from the various masks so that interrupts will never be told to go there */ - cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, 
unbanned_cpus); - cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus); - cpus_and(cpu->mask, cpu->mask, unbanned_cpus); + cpus_and(cpu_cache_domain(cpu)->common.mask, cpu_cache_domain(cpu)->common.mask, unbanned_cpus); + cpus_and(cpu_package(cpu)->common.mask, cpu_package(cpu)->common.mask, unbanned_cpus); + cpus_and(cpu->common.mask, cpu->common.mask, unbanned_cpus); cpus = g_list_append(cpus, cpu); core_count++; @@ -288,36 +255,36 @@ static void dump_irq(struct irq_info *info, void *data) int spaces = (long int)data; int i; for (i=0; iirq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->workload); + printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->workload); } static void dump_cpu_core(struct cpu_core *c, void *data __attribute__((unused))) { - printf(" CPU number %i numa_node is %d (workload %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->workload); - if (c->interrupts) - for_each_irq(c->interrupts, dump_irq, (void *)18); + printf(" CPU number %i numa_node is %d (workload %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.workload); + if (c->common.interrupts) + for_each_irq(c->common.interrupts, dump_irq, (void *)18); } static void dump_cache_domain(struct cache_domain *c, void *data) { char *buffer = data; - cpumask_scnprintf(buffer, 4095, c->mask); - printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->number, cache_domain_numa_node(c)->number, buffer, (unsigned long)c->workload); + cpumask_scnprintf(buffer, 4095, c->common.mask); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.workload); if (c->cpu_cores) for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL); - if (c->interrupts) - for_each_irq(c->interrupts, dump_irq, (void 
*)10); + if (c->common.interrupts) + for_each_irq(c->common.interrupts, dump_irq, (void *)10); } static void dump_package(struct package *p, void *data) { char *buffer = data; - cpumask_scnprintf(buffer, 4096, p->mask); - printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->number, package_numa_node(p)->number, buffer, (unsigned long)p->workload); + cpumask_scnprintf(buffer, 4096, p->common.mask); + printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.workload); if (p->cache_domains) for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer); - if (p->interrupts) - for_each_irq(p->interrupts, dump_irq, (void *)2); + if (p->common.interrupts) + for_each_irq(p->common.interrupts, dump_irq, (void *)2); } void dump_tree(void) @@ -341,25 +308,25 @@ void clear_work_stats(void) p_iter = g_list_first(packages); while (p_iter) { package = p_iter->data; - package->workload = 0; - g_list_free(package->interrupts); - package->interrupts = NULL; + package->common.workload = 0; + g_list_free(package->common.interrupts); + package->common.interrupts = NULL; c_iter = g_list_first(package->cache_domains); memset(package->class_count, 0, sizeof(package->class_count)); while (c_iter) { cache_domain = c_iter->data; c_iter = g_list_next(c_iter); - cache_domain->workload = 0; + cache_domain->common.workload = 0; cp_iter = cache_domain->cpu_cores; - g_list_free(cache_domain->interrupts); - cache_domain->interrupts = NULL; + g_list_free(cache_domain->common.interrupts); + cache_domain->common.interrupts = NULL; memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count)); while (cp_iter) { cpu = cp_iter->data; cp_iter = g_list_next(cp_iter); - cpu->workload = 0; - g_list_free(cpu->interrupts); - cpu->interrupts = NULL; + cpu->common.workload = 0; + g_list_free(cpu->common.interrupts); + cpu->common.interrupts = NULL; memset(cpu->class_count, 0, 
sizeof(cpu->class_count)); } } @@ -415,7 +382,7 @@ void clear_cpu_tree(void) item = g_list_first(packages); package = item->data; g_list_free(package->cache_domains); - g_list_free(package->interrupts); + g_list_free(package->common.interrupts); free(package); packages = g_list_delete_link(packages, item); } @@ -425,7 +392,7 @@ void clear_cpu_tree(void) item = g_list_first(cache_domains); cache_domain = item->data; g_list_free(cache_domain->cpu_cores); - g_list_free(cache_domain->interrupts); + g_list_free(cache_domain->common.interrupts); free(cache_domain); cache_domains = g_list_delete_link(cache_domains, item); } @@ -435,7 +402,7 @@ void clear_cpu_tree(void) while (cpus) { item = g_list_first(cpus); cpu = item->data; - g_list_free(cpu->interrupts); + g_list_free(cpu->common.interrupts); free(cpu); cpus = g_list_delete_link(cpus, item); } diff --git a/irqlist.c b/irqlist.c index 9a8bffd..35ef317 100644 --- a/irqlist.c +++ b/irqlist.c @@ -88,7 +88,7 @@ void reset_counts(void) static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) { - printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->workload); + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->workload); } void dump_workloads(void) diff --git a/numa.c b/numa.c index 19817b1..1173529 100644 --- a/numa.c +++ b/numa.c @@ -38,11 +38,13 @@ GList *numa_nodes = NULL; struct numa_node unspecified_node = { - .workload = 0, - .number = -1, - .mask = CPU_MASK_ALL, + .common = { + .workload = 0, + .number = -1, + .mask = CPU_MASK_ALL, + .interrupts = NULL, + }, .packages = NULL, - .interrupts = NULL, }; static void add_one_node(const char *nodename) @@ -60,18 +62,18 @@ static void add_one_node(const char *nodename) sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); f = fopen(path, 
"r"); if (ferror(f)) { - cpus_clear(new->mask); + cpus_clear(new->common.mask); } else { fscanf(f, "%as", &cpustr); if (!cpustr) { - cpus_clear(new->mask); + cpus_clear(new->common.mask); } else { - cpumask_parse_user(cpustr, strlen(cpustr), new->mask); + cpumask_parse_user(cpustr, strlen(cpustr), new->common.mask); free(cpustr); } } - new->number = strtoul(&nodename[4], NULL, 10); + new->common.number = strtoul(&nodename[4], NULL, 10); numa_nodes = g_list_append(numa_nodes, new); } @@ -106,16 +108,16 @@ static gint compare_node(gconstpointer a, gconstpointer b) const struct numa_node *ai = a; const struct numa_node *bi = b; - return (ai->number == bi->number) ? 0 : 1; + return (ai->common.number == bi->common.number) ? 0 : 1; } void add_package_to_node(struct package *p, int nodeid) { struct numa_node find, *node; - find.number = nodeid; + find.common.number = nodeid; GList *entry; - find.number = nodeid; + find.common.number = nodeid; entry = g_list_find_custom(numa_nodes, &find, compare_node); if (!entry) { @@ -134,8 +136,8 @@ void dump_numa_node_info(struct numa_node *node, void *unused __attribute__((unu { char buffer[4096]; - printf("NUMA NODE NUMBER: %d\n", node->number); - cpumask_scnprintf(buffer, 4096, node->mask); + printf("NUMA NODE NUMBER: %d\n", node->common.number); + cpumask_scnprintf(buffer, 4096, node->common.mask); printf("LOCAL CPU MASK: %s\n", buffer); printf("\n"); } @@ -161,7 +163,7 @@ struct numa_node *get_numa_node(int nodeid) if (nodeid == -1) return &unspecified_node; - find.number = nodeid; + find.common.number = nodeid; entry = g_list_find_custom(numa_nodes, &find, compare_node); return entry ? 
entry->data : NULL; diff --git a/placement.c b/placement.c index 3f969b5..bb493f5 100644 --- a/placement.c +++ b/placement.c @@ -38,13 +38,13 @@ static uint64_t package_cost_func(struct irq_info *irq, struct package *package) int maxcount; int dist; /* moving to a cold package/cache/etc gets you a 3000 penalty */ - if (!cpus_intersects(irq->old_mask, package->mask)) + if (!cpus_intersects(irq->old_mask, package->common.mask)) bonus = CROSS_PACKAGE_PENALTY; /* do a little numa affinity */ - if (irq_numa_node(irq)->number != package_numa_node(package)->number) { - if (irq_numa_node(irq)->number >= 0 && package_numa_node(package)->number >= 0) { - dist = numa_distance(irq_numa_node(irq)->number, package_numa_node(package)->number); + if (irq_numa_node(irq)->common.number != package_numa_node(package)->common.number) { + if (irq_numa_node(irq)->common.number >= 0 && package_numa_node(package)->common.number >= 0) { + dist = numa_distance(irq_numa_node(irq)->common.number, package_numa_node(package)->common.number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -56,7 +56,7 @@ static uint64_t package_cost_func(struct irq_info *irq, struct package *package) /* in power save mode, you better be on package 0, with overflow to the next package if really needed */ if (power_mode) - bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number; + bonus += POWER_MODE_PACKAGE_THRESHOLD * package->common.number; /* if we're out of whack in terms of per class counts.. 
just block (except in power mode) */ maxcount = (class_counts[irq->class] + package_count -1 ) / package_count; @@ -72,13 +72,13 @@ static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain int dist; /* moving to a cold cache gets you a 1500 penalty */ - if (!cpus_intersects(irq->old_mask, cache_domain->mask)) + if (!cpus_intersects(irq->old_mask, cache_domain->common.mask)) bonus = CROSS_PACKAGE_PENALTY/2; /* do a little numa affinity */ - if (irq_numa_node(irq)->number != cache_domain->node_num) { - if (irq_numa_node(irq)->number >= 0 && cache_domain->node_num >= 0) { - dist = numa_distance(irq_numa_node(irq)->number, cache_domain->node_num); + if (irq_numa_node(irq)->common.number != cache_domain_numa_node(cache_domain)->common.number) { + if (irq_numa_node(irq)->common.number >= 0 && cache_domain_numa_node(cache_domain)->common.number >= 0) { + dist = numa_distance(irq_numa_node(irq)->common.number, cache_domain_numa_node(cache_domain)->common.number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? 
NUMA_PENALTY * (dist-10) : 0; } @@ -107,13 +107,13 @@ static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) int dist; /* moving to a colder core gets you a 1000 penalty */ - if (!cpus_intersects(irq->old_mask, cpu->mask)) + if (!cpus_intersects(irq->old_mask, cpu->common.mask)) bonus = CROSS_PACKAGE_PENALTY/3; /* do a little numa affinity */ - if (irq_numa_node(irq)->number != cpu_numa_node(cpu)->number) { - if (irq_numa_node(irq)->number >= 0 && cpu_numa_node(cpu)->number >= 0) { - dist = numa_distance(irq_numa_node(irq)->number, cpu_numa_node(cpu)->number); + if (irq_numa_node(irq)->common.number != cpu_numa_node(cpu)->common.number) { + if (irq_numa_node(irq)->common.number >= 0 && cpu_numa_node(cpu)->common.number >= 0) { + dist = numa_distance(irq_numa_node(irq)->common.number, cpu_numa_node(cpu)->common.number); /* moving to a distant numa node results into penalty */ bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; } @@ -127,7 +127,7 @@ static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) * since some chipsets only place at the first cpu, give a tiny preference to non-first * cpus for specifically placed interrupts */ - if (first_cpu(cpu_cache_domain(cpu)->mask)==cpu->number) + if (first_cpu(cpu_cache_domain(cpu)->common.mask)==cpu->common.number) bonus++; /* pay 6000 for each previous interrupt of the same class */ @@ -147,7 +147,7 @@ static void find_best_cd(struct cache_domain *c, void *data) struct cache_domain_placement *best = data; uint64_t newload; - newload = c->workload + cache_domain_cost_func(best->info, c); + newload = c->common.workload + cache_domain_cost_func(best->info, c); if (newload < best->best_cost) { best->best = c; best->best_cost = newload; @@ -169,18 +169,18 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) for_each_cache_domain(p->cache_domains, find_best_cd, &place); if (place.best) { - migrate_irq(&p->interrupts, &place.best->interrupts, info); + 
migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info); info->assigned_obj = place.best; place.best->class_count[info->class]++; - info->mask = place.best->mask; + info->mask = place.best->common.mask; } } static void place_cache_domain(struct package *package, void *data __attribute__((unused))) { - if (package->interrupts) - for_each_irq(package->interrupts, place_irq_in_cache_domain, package); + if (package->common.interrupts) + for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package); } @@ -195,7 +195,7 @@ static void place_irq_in_core(struct cpu_core *c, void *data) struct core_placement *best = data; uint64_t newload; - newload = c->workload + cpu_cost_func(best->info, c); + newload = c->common.workload + cpu_cost_func(best->info, c); if (newload < best->best_cost) { best->best = c; best->best_cost = newload; @@ -218,18 +218,18 @@ static void place_core(struct irq_info *info, void *data) for_each_cpu_core(c->cpu_cores, place_irq_in_core, &place); if (place.best) { - migrate_irq(&c->interrupts, &place.best->interrupts, info); + migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info); info->assigned_obj = place.best; - place.best->workload += info->workload + 1; - info->mask = place.best->mask; + place.best->common.workload += info->workload + 1; + info->mask = place.best->common.mask; } } static void place_cores(struct cache_domain *cache_domain, void *data __attribute__((unused))) { - if (cache_domain->interrupts) - for_each_irq(cache_domain->interrupts, place_core, cache_domain); + if (cache_domain->common.interrupts) + for_each_irq(cache_domain->common.interrupts, place_core, cache_domain); } struct package_placement { @@ -243,7 +243,7 @@ static void find_best_package(struct package *p, void *data) uint64_t newload; struct package_placement *place = data; - newload = p->workload + package_cost_func(place->info, p); + newload = p->common.workload + package_cost_func(place->info, p); if (newload < 
place->best_cost) { place->best = p; place->best_cost = newload; @@ -264,11 +264,11 @@ static void place_irq_in_package(struct irq_info *info, void *unused __attribute for_each_package(NULL, find_best_package, &place); if (place.best) { - migrate_irq(NULL, &place.best->interrupts, info); + migrate_irq(NULL, &place.best->common.interrupts, info); info->assigned_obj = place.best; - place.best->workload += info->workload + 1; + place.best->common.workload += info->workload + 1; place.best->class_count[info->class]++; - info->mask = place.best->mask; + info->mask = place.best->common.mask; } } @@ -278,11 +278,11 @@ static void place_irq_affinity_hint(struct irq_info *info, void *data __attribut if (info->level == BALANCE_NONE) return; - if ((!cpus_empty(irq_numa_node(info)->mask)) && - (!cpus_equal(info->mask, irq_numa_node(info)->mask)) && - (!__cpus_full(&irq_numa_node(info)->mask, num_possible_cpus()))) { + if ((!cpus_empty(irq_numa_node(info)->common.mask)) && + (!cpus_equal(info->mask, irq_numa_node(info)->common.mask)) && + (!__cpus_full(&irq_numa_node(info)->common.mask, num_possible_cpus()))) { info->old_mask = info->mask; - info->mask = irq_numa_node(info)->mask; + info->mask = irq_numa_node(info)->common.mask; } } @@ -296,27 +296,27 @@ static void check_cpu_irq_route(struct cpu_core *c, void *data) { struct irq_info *info = data; - if (cpus_intersects(c->mask, irq_numa_node(info)->mask) || - cpus_intersects(c->mask, info->mask)) - c->workload += info->workload; + if (cpus_intersects(c->common.mask, irq_numa_node(info)->common.mask) || + cpus_intersects(c->common.mask, info->mask)) + c->common.workload += info->workload; } static void check_cd_irq_route(struct cache_domain *c, void *data) { struct irq_info *info = data; - if (cpus_intersects(c->mask, irq_numa_node(info)->mask) || - cpus_intersects(c->mask, info->mask)) - c->workload += info->workload; + if (cpus_intersects(c->common.mask, irq_numa_node(info)->common.mask) || + cpus_intersects(c->common.mask, 
info->mask)) + c->common.workload += info->workload; } static void check_package_irq_route(struct package *p, void *data) { struct irq_info *info = data; - if (cpus_intersects(p->mask, irq_numa_node(info)->mask) || - cpus_intersects(p->mask, info->mask)) - p->workload += info->workload; + if (cpus_intersects(p->common.mask, irq_numa_node(info)->common.mask) || + cpus_intersects(p->common.mask, info->mask)) + p->common.workload += info->workload; } static void check_irq_route(struct irq_info *info, void *data __attribute__((unused))) @@ -345,20 +345,20 @@ static void validate_irq(struct irq_info *info, void *data) static void validate_package(struct package *p, void *data __attribute__((unused))) { - if (p->interrupts) - for_each_irq(p->interrupts, validate_irq, p); + if (p->common.interrupts) + for_each_irq(p->common.interrupts, validate_irq, p); } static void validate_cd(struct cache_domain *c, void *data __attribute__((unused))) { - if (c->interrupts) - for_each_irq(c->interrupts, validate_irq, c); + if (c->common.interrupts) + for_each_irq(c->common.interrupts, validate_irq, c); } static void validate_cpu(struct cpu_core *c, void *data __attribute__((unused))) { - if (c->interrupts) - for_each_irq(c->interrupts, validate_irq, c); + if (c->common.interrupts) + for_each_irq(c->common.interrupts, validate_irq, c); } static void validate_object_tree_placement() diff --git a/types.h b/types.h index bde3dd0..1cc993f 100644 --- a/types.h +++ b/types.h @@ -27,60 +27,39 @@ #define IRQ_TYPE_MSIX 2 -/* - * IRQ properties - */ +struct common_obj_data { + uint64_t workload; + int number; + cpumask_t mask; + GList *interrupts; +}; + struct numa_node { - uint64_t workload; - int number; - cpumask_t mask; + struct common_obj_data common; GList *packages; - GList *interrupts; }; struct package { - uint64_t workload; - int number; - - cpumask_t mask; + struct common_obj_data common; struct numa_node *numa_node; - int class_count[7]; - GList *cache_domains; - GList *interrupts; }; 
struct cache_domain { - uint64_t workload; - int number; - + struct common_obj_data common; int marker; - int node_num; - struct package *package; - - cpumask_t mask; - int class_count[7]; - GList *cpu_cores; - GList *interrupts; }; struct cpu_core { - uint64_t workload; - int number; - + struct common_obj_data common; int marker; struct cache_domain *cache_domain; - int class_count[7]; - - cpumask_t mask; - - GList *interrupts; }; struct irq_info { From e8b40b53c56b08ababc1f194f8691125e0c078b8 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 14:36:43 -0400 Subject: [PATCH 12/44] Build a list of irqs to be migrated Currently we re-examine all irqs each iteration. instead we should build a list of irqs we want to move, and only rebalance those. Currently we still rebalance all irqs every iteration, but this will soon give us a chance to be more selective than that. --- cputree.c | 6 ------ irqbalance.c | 11 +++++++++++ irqbalance.h | 4 +--- placement.c | 10 +++++----- types.h | 2 +- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cputree.c b/cputree.c index 7d99eb1..e0ba10e 100644 --- a/cputree.c +++ b/cputree.c @@ -309,8 +309,6 @@ void clear_work_stats(void) while (p_iter) { package = p_iter->data; package->common.workload = 0; - g_list_free(package->common.interrupts); - package->common.interrupts = NULL; c_iter = g_list_first(package->cache_domains); memset(package->class_count, 0, sizeof(package->class_count)); while (c_iter) { @@ -318,15 +316,11 @@ void clear_work_stats(void) c_iter = g_list_next(c_iter); cache_domain->common.workload = 0; cp_iter = cache_domain->cpu_cores; - g_list_free(cache_domain->common.interrupts); - cache_domain->common.interrupts = NULL; memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count)); while (cp_iter) { cpu = cp_iter->data; cp_iter = g_list_next(cp_iter); cpu->common.workload = 0; - g_list_free(cpu->common.interrupts); - cpu->common.interrupts = NULL; memset(cpu->class_count, 0, 
sizeof(cpu->class_count)); } } diff --git a/irqbalance.c b/irqbalance.c index 4d76054..3ab3bee 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -127,6 +127,13 @@ static void dump_object_tree() for_each_numa_node(NULL, dump_numa_node_info, NULL); } +static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused))) +{ + info->moved = 1; + migrate_irq((info->assigned_obj ? &info->assigned_obj->interrupts : NULL), + &rebalance_irq_list, info); +} + int main(int argc, char** argv) { @@ -190,6 +197,8 @@ int main(int argc, char** argv) if (debug_mode) dump_workloads(); + for_each_irq(NULL, force_rebalance_irq, NULL); + while (1) { sleep_approx(SLEEP_INTERVAL); if (debug_mode) @@ -223,6 +232,8 @@ int main(int argc, char** argv) if (one_shot_mode) break; counter++; + + for_each_irq(NULL, force_rebalance_irq, NULL); } free_object_tree(); return EXIT_SUCCESS; diff --git a/irqbalance.h b/irqbalance.h index 39fa5d2..a1833ba 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -12,8 +12,6 @@ #include "types.h" #include -struct interrupt; - extern int package_count; extern int cache_domain_count; extern int core_count; @@ -24,7 +22,6 @@ extern int debug_mode; extern int power_mode; extern int need_cpu_rescan; extern int one_shot_mode; -extern GList *interrupts; extern void parse_cpu_tree(void); extern void clear_work_stats(void); @@ -32,6 +29,7 @@ extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); +extern GList *rebalance_irq_list; void calculate_workload(void); void reset_counts(void); diff --git a/placement.c b/placement.c index bb493f5..45006b6 100644 --- a/placement.c +++ b/placement.c @@ -30,7 +30,7 @@ int power_mode; -extern GList *packages, *cache_domains, *cpus; +GList *rebalance_irq_list; static uint64_t package_cost_func(struct irq_info *irq, struct package *package) { @@ -170,7 +170,7 @@ static void place_irq_in_cache_domain(struct irq_info *info, void 
*data) if (place.best) { migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info); - info->assigned_obj = place.best; + info->assigned_obj = (struct common_obj_data *)place.best; place.best->class_count[info->class]++; info->mask = place.best->common.mask; } @@ -219,7 +219,7 @@ static void place_core(struct irq_info *info, void *data) if (place.best) { migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info); - info->assigned_obj = place.best; + info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; info->mask = place.best->common.mask; } @@ -265,7 +265,7 @@ static void place_irq_in_package(struct irq_info *info, void *unused __attribute if (place.best) { migrate_irq(NULL, &place.best->common.interrupts, info); - info->assigned_obj = place.best; + info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; place.best->class_count[info->class]++; info->mask = place.best->common.mask; @@ -376,7 +376,7 @@ void calculate_placement(void) sort_irq_list(); do_unroutables(); - for_each_irq(NULL, place_irq_in_package, NULL); + for_each_irq(rebalance_irq_list, place_irq_in_package, NULL); for_each_package(NULL, place_cache_domain, NULL); for_each_cache_domain(NULL, place_cores, NULL); diff --git a/types.h b/types.h index 1cc993f..6551e47 100644 --- a/types.h +++ b/types.h @@ -76,7 +76,7 @@ struct irq_info { uint64_t last_irq_count; uint64_t workload; int moved; - void *assigned_obj; + struct common_obj_data *assigned_obj; }; #endif From 56bf21a6e57fb7827d05fe20fe28f52fee3233ca Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 14:39:27 -0400 Subject: [PATCH 13/44] Filter activation on moved flag the new irq_info structure flags irqs that have been migrated. 
We can just test that instead of having to compare mask values --- activate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/activate.c b/activate.c index 8c267a7..d07ec01 100644 --- a/activate.c +++ b/activate.c @@ -37,9 +37,10 @@ static void activate_mapping(struct irq_info *info, void *data __attribute__((un char buf[PATH_MAX]; FILE *file; - if (info->level == BALANCE_NONE) - return; - if (cpus_equal(info->mask, info->old_mask)) + /* + * only activate mappings for irqs that have moved + */ + if (!info->moved) return; sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); From 02532f932199830f8ea4b2b45cef4398a25bdcee Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 14:46:00 -0400 Subject: [PATCH 14/44] Filter all placement calculations on moved flag We don't want to consider any irqs that have not been marked as needing to be rebalanced --- placement.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/placement.c b/placement.c index 45006b6..4162508 100644 --- a/placement.c +++ b/placement.c @@ -159,6 +159,9 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) struct package *p = data; struct cache_domain_placement place; + if (!info->moved) + return; + if (info->level <= BALANCE_PACKAGE) return; @@ -207,6 +210,9 @@ static void place_core(struct irq_info *info, void *data) struct cache_domain *c = data; struct core_placement place; + if (!info->moved) + return; + if ((info->level <= BALANCE_CACHE) && (!one_shot_mode)) return; @@ -254,6 +260,9 @@ static void place_irq_in_package(struct irq_info *info, void *unused __attribute { struct package_placement place; + if (!info->moved) + return; + if (info->level == BALANCE_NONE) return; From 02c401215d453b492f9010d78ca2b8524a7877b2 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 15:38:53 -0400 Subject: [PATCH 15/44] Make NUMA node a top level placement decision We've been trying to shoehorn in numa awareness for awhile, this 
prioritizes numa placement as the first decision the balancer makes. We now select placement for irqs in this order: 1) Numa node 2) package 3) cache domain 4) core and we stop balancing at the level indicated by the irqs class. --- placement.c | 98 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/placement.c b/placement.c index 4162508..4d8fd55 100644 --- a/placement.c +++ b/placement.c @@ -36,20 +36,10 @@ static uint64_t package_cost_func(struct irq_info *irq, struct package *package) { int bonus = 0; int maxcount; - int dist; /* moving to a cold package/cache/etc gets you a 3000 penalty */ if (!cpus_intersects(irq->old_mask, package->common.mask)) bonus = CROSS_PACKAGE_PENALTY; - /* do a little numa affinity */ - if (irq_numa_node(irq)->common.number != package_numa_node(package)->common.number) { - if (irq_numa_node(irq)->common.number >= 0 && package_numa_node(package)->common.number >= 0) { - dist = numa_distance(irq_numa_node(irq)->common.number, package_numa_node(package)->common.number); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? 
NUMA_PENALTY * (dist-10) : 0; - } - } - /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) bonus = bonus / 10; @@ -69,21 +59,11 @@ static uint64_t package_cost_func(struct irq_info *irq, struct package *package) static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain *cache_domain) { int bonus = 0; - int dist; /* moving to a cold cache gets you a 1500 penalty */ if (!cpus_intersects(irq->old_mask, cache_domain->common.mask)) bonus = CROSS_PACKAGE_PENALTY/2; - /* do a little numa affinity */ - if (irq_numa_node(irq)->common.number != cache_domain_numa_node(cache_domain)->common.number) { - if (irq_numa_node(irq)->common.number >= 0 && cache_domain_numa_node(cache_domain)->common.number >= 0) { - dist = numa_distance(irq_numa_node(irq)->common.number, cache_domain_numa_node(cache_domain)->common.number); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; - } - } - /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) bonus = bonus / 10; @@ -104,21 +84,11 @@ static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) { int bonus = 0; - int dist; /* moving to a colder core gets you a 1000 penalty */ if (!cpus_intersects(irq->old_mask, cpu->common.mask)) bonus = CROSS_PACKAGE_PENALTY/3; - /* do a little numa affinity */ - if (irq_numa_node(irq)->common.number != cpu_numa_node(cpu)->common.number) { - if (irq_numa_node(irq)->common.number >= 0 && cpu_numa_node(cpu)->common.number >= 0) { - dist = numa_distance(irq_numa_node(irq)->common.number, cpu_numa_node(cpu)->common.number); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? 
NUMA_PENALTY * (dist-10) : 0; - } - } - /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) bonus = bonus / 10; @@ -256,9 +226,10 @@ static void find_best_package(struct package *p, void *data) } } -static void place_irq_in_package(struct irq_info *info, void *unused __attribute__((unused))) +static void place_irq_in_package(struct irq_info *info, void *data) { struct package_placement place; + struct numa_node *n = data; if (!info->moved) return; @@ -270,10 +241,10 @@ static void place_irq_in_package(struct irq_info *info, void *unused __attribute place.best = NULL; place.info = info; - for_each_package(NULL, find_best_package, &place); + for_each_package(n->packages, find_best_package, &place); if (place.best) { - migrate_irq(NULL, &place.best->common.interrupts, info); + migrate_irq(&n->common.interrupts, &place.best->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; place.best->class_count[info->class]++; @@ -281,6 +252,64 @@ static void place_irq_in_package(struct irq_info *info, void *unused __attribute } } +static void place_packages(struct numa_node *n, void *data __attribute__((unused))) +{ + if (n->common.interrupts) + for_each_irq(n->common.interrupts, place_irq_in_package, n); +} + +struct node_placement { + struct irq_info *info; + struct numa_node *best; + uint64_t best_cost; +}; + +static void find_best_node(struct numa_node *n, void *data) +{ + struct node_placement *place = data; + + /* + * Just find the least loaded node + */ + if (n->common.workload < place->best_cost) { + place->best = n; + place->best_cost = n->common.workload; + } +} + +static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) +{ + struct node_placement place; + + if( info->level == BALANCE_NONE) + return; + + if (irq_numa_node(info)->common.number != -1) { + /* + * This irq belongs to a device with a preferred 
numa node + * put it on that node + */ + migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info); + info->assigned_obj = (struct common_obj_data *)irq_numa_node(info); + irq_numa_node(info)->common.workload += info->workload + 1; + info->mask = irq_numa_node(info)->common.mask; + return; + } + + place.best_cost = INT_MAX; + place.best = NULL; + place.info = info; + + for_each_numa_node(NULL, find_best_node, &place); + + if (place.best) { + migrate_irq(&rebalance_irq_list, &place.best->common.interrupts, info); + info->assigned_obj = (struct common_obj_data *)place.best; + place.best->common.workload += info->workload + 1; + info->mask = place.best->common.mask; + } +} + static void place_irq_affinity_hint(struct irq_info *info, void *data __attribute__((unused))) { @@ -385,7 +414,8 @@ void calculate_placement(void) sort_irq_list(); do_unroutables(); - for_each_irq(rebalance_irq_list, place_irq_in_package, NULL); + for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); + for_each_numa_node(NULL, place_packages, NULL); for_each_package(NULL, place_cache_domain, NULL); for_each_cache_domain(NULL, place_cores, NULL); From 3176fa04d45f0cd71d33a8a59fc2ac30f883f084 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 16:07:27 -0400 Subject: [PATCH 16/44] Sort only irqs that are getting rebalanced we only need to order irqs that are getting moved. 
--- classify.c | 4 ++-- irqbalance.c | 2 +- irqbalance.h | 2 +- placement.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/classify.c b/classify.c index dc5f05b..c504543 100644 --- a/classify.c +++ b/classify.c @@ -345,7 +345,7 @@ static gint sort_irqs(gconstpointer A, gconstpointer B) } -void sort_irq_list(void) +void sort_irq_list(GList **list) { - interrupts_db = g_list_sort(interrupts_db, sort_irqs); + *list = g_list_sort(*list, sort_irqs); } diff --git a/irqbalance.c b/irqbalance.c index 3ab3bee..514d9c6 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -193,11 +193,11 @@ int main(int argc, char** argv) reset_counts(); parse_proc_interrupts(); calculate_workload(); - sort_irq_list(); if (debug_mode) dump_workloads(); for_each_irq(NULL, force_rebalance_irq, NULL); + sort_irq_list(&rebalance_irq_list); while (1) { sleep_approx(SLEEP_INTERVAL); diff --git a/irqbalance.h b/irqbalance.h index a1833ba..b0e1c7e 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -34,7 +34,7 @@ extern GList *rebalance_irq_list; void calculate_workload(void); void reset_counts(void); void dump_workloads(void); -void sort_irq_list(void); +void sort_irq_list(GList **list); void calculate_placement(void); void dump_tree(void); diff --git a/placement.c b/placement.c index 4d8fd55..dc04c52 100644 --- a/placement.c +++ b/placement.c @@ -411,7 +411,7 @@ void calculate_placement(void) /* first clear old data */ clear_work_stats(); - sort_irq_list(); + sort_irq_list(&rebalance_irq_list); do_unroutables(); for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); From 0ee17356719bb7a6e91234aadf07258d83cc4fb0 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 16:19:08 -0400 Subject: [PATCH 17/44] Remove mask and old_mask values from irq_info We don't need them anymore, because unroutable irqs just don't get touched anymore, and we have the assigned_obj pointer to gather our mask value from during activation. 
Note - This removal does necessecitate the removal of affinity hint, but we'll be reimplementing that soon, as the prior policy was rather inflexible. --- activate.c | 8 +++-- classify.c | 1 + irqbalance.c | 1 - placement.c | 86 ---------------------------------------------------- types.h | 2 -- 5 files changed, 7 insertions(+), 91 deletions(-) diff --git a/activate.c b/activate.c index d07ec01..b201e9b 100644 --- a/activate.c +++ b/activate.c @@ -43,15 +43,19 @@ static void activate_mapping(struct irq_info *info, void *data __attribute__((un if (!info->moved) return; + if (!info->assigned_obj) + return; + + sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); file = fopen(buf, "w"); if (!file) return; - cpumask_scnprintf(buf, PATH_MAX, info->mask); + cpumask_scnprintf(buf, PATH_MAX, info->assigned_obj->mask); fprintf(file, "%s", buf); fclose(file); - info->old_mask = info->mask; + info->moved = 0; /*migration is done*/ } void activate_mappings(void) diff --git a/classify.c b/classify.c index c504543..5c6a58a 100644 --- a/classify.c +++ b/classify.c @@ -323,6 +323,7 @@ void migrate_irq(GList **from, GList **to, struct irq_info *info) *to = g_list_append(*to, tmp); + info->moved = 1; } static gint sort_irqs(gconstpointer A, gconstpointer B) diff --git a/irqbalance.c b/irqbalance.c index 514d9c6..0951dc8 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -129,7 +129,6 @@ static void dump_object_tree() static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused))) { - info->moved = 1; migrate_irq((info->assigned_obj ? 
&info->assigned_obj->interrupts : NULL), &rebalance_irq_list, info); } diff --git a/placement.c b/placement.c index dc04c52..acd1bcf 100644 --- a/placement.c +++ b/placement.c @@ -36,9 +36,6 @@ static uint64_t package_cost_func(struct irq_info *irq, struct package *package) { int bonus = 0; int maxcount; - /* moving to a cold package/cache/etc gets you a 3000 penalty */ - if (!cpus_intersects(irq->old_mask, package->common.mask)) - bonus = CROSS_PACKAGE_PENALTY; /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) @@ -60,10 +57,6 @@ static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain { int bonus = 0; - /* moving to a cold cache gets you a 1500 penalty */ - if (!cpus_intersects(irq->old_mask, cache_domain->common.mask)) - bonus = CROSS_PACKAGE_PENALTY/2; - /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) bonus = bonus / 10; @@ -85,10 +78,6 @@ static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) { int bonus = 0; - /* moving to a colder core gets you a 1000 penalty */ - if (!cpus_intersects(irq->old_mask, cpu->common.mask)) - bonus = CROSS_PACKAGE_PENALTY/3; - /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) bonus = bonus / 10; @@ -145,7 +134,6 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)place.best; place.best->class_count[info->class]++; - info->mask = place.best->common.mask; } } @@ -197,7 +185,6 @@ static void place_core(struct irq_info *info, void *data) migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; - info->mask = place.best->common.mask; } } @@ -248,7 +235,6 @@ static void 
place_irq_in_package(struct irq_info *info, void *data) info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; place.best->class_count[info->class]++; - info->mask = place.best->common.mask; } } @@ -292,7 +278,6 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)irq_numa_node(info); irq_numa_node(info)->common.workload += info->workload + 1; - info->mask = irq_numa_node(info)->common.mask; return; } @@ -306,73 +291,9 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u migrate_irq(&rebalance_irq_list, &place.best->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)place.best; place.best->common.workload += info->workload + 1; - info->mask = place.best->common.mask; } } -static void place_irq_affinity_hint(struct irq_info *info, void *data __attribute__((unused))) -{ - - if (info->level == BALANCE_NONE) - return; - - if ((!cpus_empty(irq_numa_node(info)->common.mask)) && - (!cpus_equal(info->mask, irq_numa_node(info)->common.mask)) && - (!__cpus_full(&irq_numa_node(info)->common.mask, num_possible_cpus()))) { - info->old_mask = info->mask; - info->mask = irq_numa_node(info)->common.mask; - } -} - -static void place_affinity_hint(void) -{ - for_each_irq(NULL, place_irq_affinity_hint, NULL); -} - - -static void check_cpu_irq_route(struct cpu_core *c, void *data) -{ - struct irq_info *info = data; - - if (cpus_intersects(c->common.mask, irq_numa_node(info)->common.mask) || - cpus_intersects(c->common.mask, info->mask)) - c->common.workload += info->workload; -} - -static void check_cd_irq_route(struct cache_domain *c, void *data) -{ - struct irq_info *info = data; - - if (cpus_intersects(c->common.mask, irq_numa_node(info)->common.mask) || - cpus_intersects(c->common.mask, info->mask)) - 
c->common.workload += info->workload; -} - -static void check_package_irq_route(struct package *p, void *data) -{ - struct irq_info *info = data; - - if (cpus_intersects(p->common.mask, irq_numa_node(info)->common.mask) || - cpus_intersects(p->common.mask, info->mask)) - p->common.workload += info->workload; -} - -static void check_irq_route(struct irq_info *info, void *data __attribute__((unused))) -{ - - if (info->level != BALANCE_NONE) - return; - - for_each_package(NULL, check_package_irq_route, info); - for_each_cache_domain(NULL, check_cd_irq_route, info); - for_each_cpu_core(NULL, check_cpu_irq_route, info); -} - -static void do_unroutables(void) -{ - for_each_irq(NULL, check_irq_route, NULL); -} - static void validate_irq(struct irq_info *info, void *data) { printf("Validating irq %d %p against %p\n", info->irq, info->assigned_obj, data); @@ -412,19 +333,12 @@ void calculate_placement(void) clear_work_stats(); sort_irq_list(&rebalance_irq_list); - do_unroutables(); for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); for_each_numa_node(NULL, place_packages, NULL); for_each_package(NULL, place_cache_domain, NULL); for_each_cache_domain(NULL, place_cores, NULL); - /* - * if affinity_hint is populated on irq and is not set to - * all CPUs (meaning it's initialized), honor that above - * anything in the package locality/workload. 
- */ - place_affinity_hint(); if (debug_mode) validate_object_tree_placement(); } diff --git a/types.h b/types.h index 6551e47..0b5935e 100644 --- a/types.h +++ b/types.h @@ -70,8 +70,6 @@ struct irq_info { struct numa_node *numa_node; cpumask_t cpumask; cpumask_t affinity_hint; - cpumask_t mask; /*this will go away soon*/ - cpumask_t old_mask; /*this will go away soon*/ uint64_t irq_count; uint64_t last_irq_count; uint64_t workload; From a41d357c020332886dcccecdf882e284f4018fcd Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 20:23:42 -0400 Subject: [PATCH 18/44] Reset irq assignments on cputree rparse If we reparse the cpu tree we need to rebalance all the irqs that we are tracking --- irqbalance.c | 1 + 1 file changed, 1 insertion(+) diff --git a/irqbalance.c b/irqbalance.c index 0951dc8..5f33500 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -219,6 +219,7 @@ int main(int argc, char** argv) free_object_tree(); build_object_tree(); + for_each_irq(NULL, force_rebalance_irq, NULL); } calculate_workload(); From 507b815ff41471de7ef82866e79906710288cef4 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Oct 2011 20:41:17 -0400 Subject: [PATCH 19/44] Add load metric storage for new balance algorithm Add storage to track cpu load of irq and softirq time for each balancing object --- cputree.c | 57 ++++++++++++++++++++++++++++++------------------------- types.h | 4 ++++ 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/cputree.c b/cputree.c index e0ba10e..adaea75 100644 --- a/cputree.c +++ b/cputree.c @@ -293,6 +293,36 @@ void dump_tree(void) for_each_package(NULL, dump_package, buffer); } +static void clear_cpu_stats(struct cpu_core *c, void *data __attribute__((unused))) +{ + memset(c->class_count, 0, sizeof(c->class_count)); + c->common.workload = 0; + c->common.load = 0; +} + +static void clear_cd_stats(struct cache_domain *c, void *data __attribute__((unused))) +{ + memset(c->class_count, 0, sizeof(c->class_count)); + 
c->common.workload = 0; + c->common.load = 0; + for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL); +} + +static void clear_package_stats(struct package *p, void *data __attribute__((unused))) +{ + memset(p->class_count, 0, sizeof(p->class_count)); + p->common.workload = 0; + p->common.load = 0; + for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL); +} + +static void clear_node_stats(struct numa_node *n, void *data __attribute__((unused))) +{ + n->common.workload = 0; + n->common.load = 0; + for_each_package(n->packages, clear_package_stats, NULL); +} + /* * this function removes previous state from the cpu tree, such as * which level does how much work and the actual lists of interrupts @@ -300,32 +330,7 @@ void dump_tree(void) */ void clear_work_stats(void) { - GList *p_iter, *c_iter, *cp_iter; - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - - p_iter = g_list_first(packages); - while (p_iter) { - package = p_iter->data; - package->common.workload = 0; - c_iter = g_list_first(package->cache_domains); - memset(package->class_count, 0, sizeof(package->class_count)); - while (c_iter) { - cache_domain = c_iter->data; - c_iter = g_list_next(c_iter); - cache_domain->common.workload = 0; - cp_iter = cache_domain->cpu_cores; - memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count)); - while (cp_iter) { - cpu = cp_iter->data; - cp_iter = g_list_next(cp_iter); - cpu->common.workload = 0; - memset(cpu->class_count, 0, sizeof(cpu->class_count)); - } - } - p_iter = g_list_next(p_iter); - } + for_each_numa_node(NULL, clear_node_stats, NULL); } diff --git a/types.h b/types.h index 0b5935e..f380d04 100644 --- a/types.h +++ b/types.h @@ -29,6 +29,7 @@ struct common_obj_data { uint64_t workload; + uint64_t load; int number; cpumask_t mask; GList *interrupts; @@ -60,6 +61,8 @@ struct cpu_core { int marker; struct cache_domain *cache_domain; int class_count[7]; + uint64_t irq_load; + uint64_t softirq_load; }; 
struct irq_info { @@ -73,6 +76,7 @@ struct irq_info { uint64_t irq_count; uint64_t last_irq_count; uint64_t workload; + uint64_t load; int moved; struct common_obj_data *assigned_obj; }; From 594a8b89a780c7726f73aadb18e15eb733e109fd Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 4 Oct 2011 13:27:06 -0400 Subject: [PATCH 20/44] reset assigned_obj pointer on rebalance Don't want stale pointers hanging about when we rebalance an object --- irqbalance.c | 1 + 1 file changed, 1 insertion(+) diff --git a/irqbalance.c b/irqbalance.c index 5f33500..6a9d5f8 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -131,6 +131,7 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__( { migrate_irq((info->assigned_obj ? &info->assigned_obj->interrupts : NULL), &rebalance_irq_list, info); + info->assigned_obj = NULL; } int main(int argc, char** argv) From 17d7e1fdaaef98bfcea4f6a417602498a7e03071 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 4 Oct 2011 15:17:39 -0400 Subject: [PATCH 21/44] Adding new balance load gathering function This adds calls to parse_proc_stat to gather load information based on irq and softirq time as collected from proc/stat. 
The algorithm is: 1) Gather per cpu [soft]irq load (this patch) 2) propagate load up all parent devices (this patch) 3) distribute weighted load to each irq (this patch) Upcomming patches will identify object that are more than a standard deviation out of average that have more than one irq on them and submit those irqs for rebalancing --- cputree.c | 33 ++++++++++++++++ irqbalance.c | 2 + irqbalance.h | 2 + procinterrupts.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+) diff --git a/cputree.c b/cputree.c index adaea75..9d31c91 100644 --- a/cputree.c +++ b/cputree.c @@ -298,6 +298,8 @@ static void clear_cpu_stats(struct cpu_core *c, void *data __attribute__((unused memset(c->class_count, 0, sizeof(c->class_count)); c->common.workload = 0; c->common.load = 0; + c->irq_load = 0; + c->softirq_load = 0; } static void clear_cd_stats(struct cache_domain *c, void *data __attribute__((unused))) @@ -323,6 +325,12 @@ static void clear_node_stats(struct numa_node *n, void *data __attribute__((unus for_each_package(n->packages, clear_package_stats, NULL); } +static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) +{ + info->workload = 0; + info->load = 0; +} + /* * this function removes previous state from the cpu tree, such as * which level does how much work and the actual lists of interrupts @@ -331,6 +339,7 @@ static void clear_node_stats(struct numa_node *n, void *data __attribute__((unus void clear_work_stats(void) { for_each_numa_node(NULL, clear_node_stats, NULL); + for_each_irq(NULL, clear_irq_stats, NULL); } @@ -446,3 +455,27 @@ void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), } } +static gint compare_cpus(gconstpointer a, gconstpointer b) +{ + const struct cpu_core *ai = a; + const struct cpu_core *bi = b; + + return ai->common.number - bi->common.number; +} + +struct cpu_core *find_cpu_core(int cpunr) +{ + GList *entry; + struct cpu_core find; + + 
find.common.number = cpunr; + entry = g_list_find_custom(cpus, &find, compare_cpus); + + return entry ? entry->data : NULL; +} + +int get_cpu_count(void) +{ + return g_list_length(cpus); +} + diff --git a/irqbalance.c b/irqbalance.c index 6a9d5f8..1d0a75f 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -189,6 +189,7 @@ int main(int argc, char** argv) #endif parse_proc_interrupts(); + parse_proc_stat(); sleep(SLEEP_INTERVAL/4); reset_counts(); parse_proc_interrupts(); @@ -207,6 +208,7 @@ int main(int argc, char** argv) check_power_mode(); parse_proc_interrupts(); + parse_proc_stat(); /* cope with cpu hotplug -- detected during /proc/interrupts parsing */ if (need_cpu_rescan) { diff --git a/irqbalance.h b/irqbalance.h index b0e1c7e..2443d77 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -75,6 +75,8 @@ extern void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c #define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) extern void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), void *data); +extern struct cpu_core *find_cpu_core(int cpunr); +extern int get_cpu_count(void); /* * irq db functions diff --git a/procinterrupts.c b/procinterrupts.c index 094a7e1..0837248 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -47,6 +47,7 @@ void parse_proc_interrupts(void) /* first line is the header we don't need; nuke it */ if (getline(&line, &size, file)==0) { free(line); + fclose(file); return; } @@ -111,3 +112,100 @@ void parse_proc_interrupts(void) fclose(file); free(line); } + + +static void accumulate_irq_count(struct irq_info *info, void *data) +{ + uint64_t *acc = data; + + *acc += (info->irq_count - info->last_irq_count); +} + +static void assign_load_slice(struct irq_info *info, void *data) +{ + uint64_t *load_slice = data; + info->load = (info->irq_count - info->last_irq_count) * *load_slice; +} + +static 
void compute_irq_load_share(struct cpu_core *cpu, void *data __attribute__((unused))) +{ + uint64_t total_irq_counts = 0; + uint64_t load_slice; + + for_each_irq(cpu->common.interrupts, accumulate_irq_count, &total_irq_counts); + + load_slice = cpu->common.load / total_irq_counts; + + for_each_irq(cpu->common.interrupts, assign_load_slice, &load_slice); +} + +void parse_proc_stat() +{ + FILE *file; + char *line = NULL; + size_t size = 0; + int cpunr, rc, cpucount; + struct cpu_core *cpu; + struct common_obj_data *parent; + int irq_load, softirq_load; + + file = fopen("/proc/stat", "r"); + if (!file) { + syslog(LOG_WARNING, "WARNING cant open /proc/stat. balacing is broken\n"); + return; + } + + /* first line is the header we don't need; nuke it */ + if (getline(&line, &size, file)==0) { + free(line); + syslog(LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n"); + fclose(file); + return; + } + + cpucount = 0; + while (!feof(file)) { + if (getline(&line, &size, file)==0) + break; + + if (!strstr(line, "cpu")) + break; + + cpunr = strtoul(&line[3], NULL, 10); + + rc = sscanf(line, "%*s %*d %*d %*d %*d %*d %d %d", &irq_load, &softirq_load); + if (rc < 2) + break; + + cpu = find_cpu_core(cpunr); + + if (!cpu) + break; + + cpucount++; + + /* + * For each cpu add the irq and softirq load and propagate that + * all the way up the device tree + */ + cpu->irq_load = irq_load; + cpu->softirq_load = softirq_load; + cpu->common.load = irq_load + softirq_load; + cpu->cache_domain->common.load += cpu->common.load; + cpu->cache_domain->package->common.load += cpu->common.load; + cpu->cache_domain->package->numa_node->common.load += cpu->common.load; + } + + fclose(file); + if (cpucount != get_cpu_count()) { + syslog(LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n"); + return; + } + + /* + * Now that we have load for each cpu attribute a fair share of the load + * to each irq on that cpu + */ + for_each_cpu_core(NULL, 
compute_irq_load_share, NULL); + +} From d04c853443012d3d9f51265fdd97611413524cc8 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 5 Oct 2011 13:40:51 -0400 Subject: [PATCH 22/44] convert for_each_* iterators to use common_obj_data struct This allows more opportuinity to share code among multiple objects --- cputree.c | 27 +++++++++++++++++---------- irqbalance.h | 11 ++++++----- numa.c | 5 +++-- placement.c | 45 ++++++++++++++++++++------------------------- 4 files changed, 46 insertions(+), 42 deletions(-) diff --git a/cputree.c b/cputree.c index 9d31c91..ec2315c 100644 --- a/cputree.c +++ b/cputree.c @@ -258,15 +258,17 @@ static void dump_irq(struct irq_info *info, void *data) printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->workload); } -static void dump_cpu_core(struct cpu_core *c, void *data __attribute__((unused))) +static void dump_cpu_core(struct common_obj_data *d, void *data __attribute__((unused))) { + struct cpu_core *c = (struct cpu_core *)d; printf(" CPU number %i numa_node is %d (workload %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.workload); if (c->common.interrupts) for_each_irq(c->common.interrupts, dump_irq, (void *)18); } -static void dump_cache_domain(struct cache_domain *c, void *data) +static void dump_cache_domain(struct common_obj_data *d, void *data) { + struct cache_domain *c = (struct cache_domain *)d; char *buffer = data; cpumask_scnprintf(buffer, 4095, c->common.mask); printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.workload); @@ -276,8 +278,9 @@ static void dump_cache_domain(struct cache_domain *c, void *data) for_each_irq(c->common.interrupts, dump_irq, (void *)10); } -static void dump_package(struct package *p, void *data) +static void dump_package(struct common_obj_data *d, 
void *data) { + struct package *p = (struct package *)d; char *buffer = data; cpumask_scnprintf(buffer, 4096, p->common.mask); printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.workload); @@ -293,8 +296,9 @@ void dump_tree(void) for_each_package(NULL, dump_package, buffer); } -static void clear_cpu_stats(struct cpu_core *c, void *data __attribute__((unused))) +static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused))) { + struct cpu_core *c = (struct cpu_core *)d; memset(c->class_count, 0, sizeof(c->class_count)); c->common.workload = 0; c->common.load = 0; @@ -302,24 +306,27 @@ static void clear_cpu_stats(struct cpu_core *c, void *data __attribute__((unused c->softirq_load = 0; } -static void clear_cd_stats(struct cache_domain *c, void *data __attribute__((unused))) +static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused))) { + struct cache_domain *c = (struct cache_domain *)d; memset(c->class_count, 0, sizeof(c->class_count)); c->common.workload = 0; c->common.load = 0; for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL); } -static void clear_package_stats(struct package *p, void *data __attribute__((unused))) +static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused))) { + struct package *p = (struct package *)d; memset(p->class_count, 0, sizeof(p->class_count)); p->common.workload = 0; p->common.load = 0; for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL); } -static void clear_node_stats(struct numa_node *n, void *data __attribute__((unused))) +static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused))) { + struct numa_node *n = (struct numa_node *)d; n->common.workload = 0; n->common.load = 0; for_each_package(n->packages, clear_package_stats, NULL); @@ -419,7 +426,7 @@ void clear_cpu_tree(void) } -void 
for_each_package(GList *list, void (*cb)(struct package *p, void *data), void *data) +void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *data), void *data) { GList *entry = g_list_first(list ? list : packages); GList *next; @@ -431,7 +438,7 @@ void for_each_package(GList *list, void (*cb)(struct package *p, void *data), vo } } -void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c, void *data), void *data) +void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data) { GList *entry = g_list_first(list ? list : cache_domains); GList *next; @@ -443,7 +450,7 @@ void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c, void } } -void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), void *data) +void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data) { GList *entry = g_list_first(list ? list : cpus); GList *next; diff --git a/irqbalance.h b/irqbalance.h index 2443d77..81e4cf5 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -26,6 +26,7 @@ extern int one_shot_mode; extern void parse_cpu_tree(void); extern void clear_work_stats(void); extern void parse_proc_interrupts(void); +extern void parse_proc_stat(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); @@ -50,8 +51,8 @@ void pci_numa_scan(void); */ extern void build_numa_node_list(void); extern void free_numa_node_list(void); -extern void dump_numa_node_info(struct numa_node *node, void *data); -extern void for_each_numa_node(GList *list, void (*cb)(struct numa_node *node, void *data), void *data); +extern void dump_numa_node_info(struct common_obj_data *node, void *data); +extern void for_each_numa_node(GList *list, void (*cb)(struct common_obj_data *node, void *data), void *data); extern void add_package_to_node(struct package *p, int nodeid); extern struct numa_node 
*get_numa_node(int nodeid); @@ -59,14 +60,14 @@ extern struct numa_node *get_numa_node(int nodeid); * Package functions */ #define package_numa_node(p) ((p)->numa_node) -extern void for_each_package(GList *list, void (*cb)(struct package *p, void *data), void *data); +extern void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *data), void *data); /* * cache_domain functions */ #define cache_domain_package(c) ((c)->package) #define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) -extern void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c, void *data), void *data); +extern void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); /* * cpu core functions @@ -74,7 +75,7 @@ extern void for_each_cache_domain(GList *list, void (*cb)(struct cache_domain *c #define cpu_cache_domain(cpu) ((cpu)->cache_domain) #define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) -extern void for_each_cpu_core(GList *list, void (*cb)(struct cpu_core *c, void *data), void *data); +extern void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); extern struct cpu_core *find_cpu_core(int cpunr); extern int get_cpu_count(void); diff --git a/numa.c b/numa.c index 1173529..c239c55 100644 --- a/numa.c +++ b/numa.c @@ -132,8 +132,9 @@ void add_package_to_node(struct package *p, int nodeid) p->numa_node = node; } -void dump_numa_node_info(struct numa_node *node, void *unused __attribute__((unused))) +void dump_numa_node_info(struct common_obj_data *d, void *unused __attribute__((unused))) { + struct numa_node *node = (struct numa_node *)d; char buffer[4096]; printf("NUMA NODE NUMBER: %d\n", node->common.number); @@ -142,7 +143,7 @@ void dump_numa_node_info(struct numa_node *node, void *unused __attribute__((unu printf("\n"); } -void 
for_each_numa_node(GList *list, void(*cb)(struct numa_node *node, void *data), void *data) +void for_each_numa_node(GList *list, void(*cb)(struct common_obj_data *node, void *data), void *data) { GList *entry, *next; diff --git a/placement.c b/placement.c index acd1bcf..b6919fa 100644 --- a/placement.c +++ b/placement.c @@ -101,8 +101,9 @@ struct cache_domain_placement { uint64_t best_cost; }; -static void find_best_cd(struct cache_domain *c, void *data) +static void find_best_cd(struct common_obj_data *d, void *data) { + struct cache_domain *c = (struct cache_domain *)d; struct cache_domain_placement *best = data; uint64_t newload; @@ -138,8 +139,9 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) } -static void place_cache_domain(struct package *package, void *data __attribute__((unused))) +static void place_cache_domain(struct common_obj_data *d, void *data __attribute__((unused))) { + struct package *package = (struct package *)d; if (package->common.interrupts) for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package); } @@ -151,8 +153,9 @@ struct core_placement { struct irq_info *info; }; -static void place_irq_in_core(struct cpu_core *c, void *data) +static void place_irq_in_core(struct common_obj_data *d, void *data) { + struct cpu_core *c = (struct cpu_core *)d; struct core_placement *best = data; uint64_t newload; @@ -189,8 +192,9 @@ static void place_core(struct irq_info *info, void *data) } -static void place_cores(struct cache_domain *cache_domain, void *data __attribute__((unused))) +static void place_cores(struct common_obj_data *d, void *data __attribute__((unused))) { + struct cache_domain *cache_domain = (struct cache_domain *)d; if (cache_domain->common.interrupts) for_each_irq(cache_domain->common.interrupts, place_core, cache_domain); } @@ -201,8 +205,9 @@ struct package_placement { uint64_t best_cost; }; -static void find_best_package(struct package *p, void *data) +static void 
find_best_package(struct common_obj_data *d, void *data) { + struct package *p = (struct package *)d; uint64_t newload; struct package_placement *place = data; @@ -238,8 +243,9 @@ static void place_irq_in_package(struct irq_info *info, void *data) } } -static void place_packages(struct numa_node *n, void *data __attribute__((unused))) +static void place_packages(struct common_obj_data *d, void *data __attribute__((unused))) { + struct numa_node *n = (struct numa_node *)d; if (n->common.interrupts) for_each_irq(n->common.interrupts, place_irq_in_package, n); } @@ -250,8 +256,9 @@ struct node_placement { uint64_t best_cost; }; -static void find_best_node(struct numa_node *n, void *data) +static void find_best_node(struct common_obj_data *d, void *data) { + struct numa_node *n = (struct numa_node *)d; struct node_placement *place = data; /* @@ -302,29 +309,17 @@ static void validate_irq(struct irq_info *info, void *data) info->irq, info->assigned_obj, data); } -static void validate_package(struct package *p, void *data __attribute__((unused))) +static void validate_object(struct common_obj_data *d, void *data __attribute__((unused))) { - if (p->common.interrupts) - for_each_irq(p->common.interrupts, validate_irq, p); -} - -static void validate_cd(struct cache_domain *c, void *data __attribute__((unused))) -{ - if (c->common.interrupts) - for_each_irq(c->common.interrupts, validate_irq, c); -} - -static void validate_cpu(struct cpu_core *c, void *data __attribute__((unused))) -{ - if (c->common.interrupts) - for_each_irq(c->common.interrupts, validate_irq, c); + if (d->interrupts) + for_each_irq(d->interrupts, validate_irq, d); } static void validate_object_tree_placement() { - for_each_package(NULL, validate_package, NULL); - for_each_cache_domain(NULL, validate_cd, NULL); - for_each_cpu_core(NULL, validate_cpu, NULL); + for_each_package(NULL, validate_object, NULL); + for_each_cache_domain(NULL, validate_object, NULL); + for_each_cpu_core(NULL, validate_object, 
NULL); } void calculate_placement(void) From 3953fec6e4c492531c3ddeda5924c3509583947c Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 5 Oct 2011 16:35:09 -0400 Subject: [PATCH 23/44] Add new workload computation to irqbalance This new workload calculator computes the average load at every balance level and identifies each object at each level that is more than a standard deviation away from the average. For those above the average with more than one irq assigned to them, those irqs are placed on the re-assignment list for rebalancing until such time as the approximate reduction in load brings that object back to the average. --- Makefile.am | 2 +- irqlist.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ procinterrupts.c | 4 +- 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/Makefile.am b/Makefile.am index 5624af4..3d37155 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4 INCLUDES = -I${top_srcdir} -LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma +LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma -lm AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \ types.h diff --git a/irqlist.c b/irqlist.c index 35ef317..e69f220 100644 --- a/irqlist.c +++ b/irqlist.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "types.h" #include "irqbalance.h" @@ -65,13 +66,107 @@ void build_workload(struct irq_info *info, void *unused __attribute__((unused))) info->last_irq_count = info->irq_count; } +struct load_balance_info { + unsigned long long int total_load; + unsigned long long avg_load; + int load_sources; + unsigned long long int deviations; + long double std_deviation; +}; + +static void gather_load_stats(struct common_obj_data *obj, void *data) +{ + struct load_balance_info *info = data; + + info->total_load += obj->load; + info->load_sources 
+= 1; +} + +static void compute_deviations(struct common_obj_data *obj, void *data) +{ + struct load_balance_info *info = data; + unsigned long long int deviation; + + deviation = (obj->load > info->avg_load) ? + obj->load - info->avg_load : + info->avg_load - obj->load; + + info->deviations += (deviation * deviation); +} + +static void move_candidate_irqs(struct irq_info *info, void *data) +{ + int *remaining_deviation = (int *)data; + + if (g_list_length(info->assigned_obj->interrupts) <= 1) + return; + if (*remaining_deviation <= 0) + return; + + *remaining_deviation -= info->load; + + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + + info->assigned_obj = NULL; +} + +static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data) +{ + struct load_balance_info *info = data; + int deviation; + + /* + * Don't rebalance irqs on objects whos load is below the average + */ + if (obj->load <= info->avg_load) + return; + + deviation = obj->load - info->avg_load; + + + if ((deviation > info->std_deviation) && + (g_list_length(obj->interrupts) > 1)) { + /* + * We have a cpu that is overloaded and + * has irqs that can be moved to fix that + */ + + /* order the list from least to greatest workload */ + sort_irq_list(&obj->interrupts); + /* + * Each irq carries a weighted average amount of load + * we think its responsible for. 
Set deviation to be the load + * of the difference between this objects load and the averate, + * and migrate irqs until we only have one left, or until that + * difference reaches zero + */ + for_each_irq(NULL, move_candidate_irqs, &deviation); + } + +} + +#define find_overloaded_objs(name, info) do {\ + memset(&(info), 0, sizeof(struct load_balance_info));\ + for_each_##name(NULL, gather_load_stats, &(info));\ + (info).avg_load = (info).total_load / (info).load_sources;\ + for_each_##name(NULL, compute_deviations, &(info));\ + (info).std_deviation = (long double)((info).deviations / ((info).load_sources));\ + (info).std_deviation = sqrt((info).std_deviation);\ + for_each_##name(NULL, migrate_overloaded_irqs, &(info));\ +}while(0) + void calculate_workload(void) { int i; + struct load_balance_info info; for (i=0; i<7; i++) class_counts[i]=0; for_each_irq(NULL, build_workload, NULL); + find_overloaded_objs(cpu_core, info); + find_overloaded_objs(cache_domain, info); + find_overloaded_objs(package, info); + find_overloaded_objs(numa_node, info); } static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused))) diff --git a/procinterrupts.c b/procinterrupts.c index 0837248..20c5551 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -127,8 +127,9 @@ static void assign_load_slice(struct irq_info *info, void *data) info->load = (info->irq_count - info->last_irq_count) * *load_slice; } -static void compute_irq_load_share(struct cpu_core *cpu, void *data __attribute__((unused))) +static void compute_irq_load_share(struct common_obj_data *d, void *data __attribute__((unused))) { + struct cpu_core *cpu = (struct cpu_core *)d; uint64_t total_irq_counts = 0; uint64_t load_slice; @@ -146,7 +147,6 @@ void parse_proc_stat() size_t size = 0; int cpunr, rc, cpucount; struct cpu_core *cpu; - struct common_obj_data *parent; int irq_load, softirq_load; file = fopen("/proc/stat", "r"); From 93f959c9a6755ad6a5685e9e588a9d24375e539b Mon Sep 17 00:00:00 2001 
From: Neil Horman Date: Thu, 6 Oct 2011 11:11:58 -0400 Subject: [PATCH 24/44] Cut over to base irq placement using new algorithm This is the big move. The main loop now uses the new balance alg based on standard deviation away from the average softirq+irq time as read from /proc/stat. Initial results look good. Also cleaned out old data from previous algorithm, so we don't have any dangling mess --- classify.c | 4 +- cputree.c | 16 +--- irqbalance.c | 21 ++--- irqbalance.h | 2 +- irqlist.c | 46 +++------- numa.c | 2 +- placement.c | 223 +++++++++++++---------------------------------- procinterrupts.c | 5 +- types.h | 5 -- 9 files changed, 90 insertions(+), 234 deletions(-) diff --git a/classify.c b/classify.c index 5c6a58a..be3efa5 100644 --- a/classify.c +++ b/classify.c @@ -336,9 +336,9 @@ static gint sort_irqs(gconstpointer A, gconstpointer B) return 1; if (a->class > b->class) return -1; - if (a->workload < b->workload) + if (a->load < b->load) return 1; - if (a->workload > b->workload) + if (a->load > b->load) return -1; if (airq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->workload); + printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->load); } static void dump_cpu_core(struct common_obj_data *d, void *data __attribute__((unused))) { struct cpu_core *c = (struct cpu_core *)d; - printf(" CPU number %i numa_node is %d (workload %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.workload); + printf(" CPU number %i numa_node is %d (load %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.load); if (c->common.interrupts) for_each_irq(c->common.interrupts, dump_irq, (void *)18); } @@ -271,7 +271,7 @@ static void dump_cache_domain(struct common_obj_data *d, void *data) struct cache_domain *c = (struct cache_domain *)d; char *buffer = data; cpumask_scnprintf(buffer, 4095, 
c->common.mask); - printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.workload); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.load); if (c->cpu_cores) for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL); if (c->common.interrupts) @@ -283,7 +283,7 @@ static void dump_package(struct common_obj_data *d, void *data) struct package *p = (struct package *)d; char *buffer = data; cpumask_scnprintf(buffer, 4096, p->common.mask); - printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.workload); + printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.load); if (p->cache_domains) for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer); if (p->common.interrupts) @@ -299,8 +299,6 @@ void dump_tree(void) static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused))) { struct cpu_core *c = (struct cpu_core *)d; - memset(c->class_count, 0, sizeof(c->class_count)); - c->common.workload = 0; c->common.load = 0; c->irq_load = 0; c->softirq_load = 0; @@ -309,8 +307,6 @@ static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__( static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused))) { struct cache_domain *c = (struct cache_domain *)d; - memset(c->class_count, 0, sizeof(c->class_count)); - c->common.workload = 0; c->common.load = 0; for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL); } @@ -318,8 +314,6 @@ static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__(( static void clear_package_stats(struct common_obj_data *d, void 
*data __attribute__((unused))) { struct package *p = (struct package *)d; - memset(p->class_count, 0, sizeof(p->class_count)); - p->common.workload = 0; p->common.load = 0; for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL); } @@ -327,14 +321,12 @@ static void clear_package_stats(struct common_obj_data *d, void *data __attribut static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused))) { struct numa_node *n = (struct numa_node *)d; - n->common.workload = 0; n->common.load = 0; for_each_package(n->packages, clear_package_stats, NULL); } static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) { - info->workload = 0; info->load = 0; } diff --git a/irqbalance.c b/irqbalance.c index 1d0a75f..6607d38 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -136,6 +136,7 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__( int main(int argc, char** argv) { + int compute_migration_status=0; #ifdef HAVE_GETOPT_LONG parse_command_line(argc, argv); @@ -188,17 +189,7 @@ int main(int argc, char** argv) capng_apply(CAPNG_SELECT_BOTH); #endif - parse_proc_interrupts(); - parse_proc_stat(); - sleep(SLEEP_INTERVAL/4); - reset_counts(); - parse_proc_interrupts(); - calculate_workload(); - if (debug_mode) - dump_workloads(); - for_each_irq(NULL, force_rebalance_irq, NULL); - sort_irq_list(&rebalance_irq_list); while (1) { sleep_approx(SLEEP_INTERVAL); @@ -223,9 +214,14 @@ int main(int argc, char** argv) free_object_tree(); build_object_tree(); for_each_irq(NULL, force_rebalance_irq, NULL); - } + compute_migration_status=0; + } + + if (compute_migration_status) + update_migration_status(); + else + compute_migration_status=1; - calculate_workload(); calculate_placement(); activate_mappings(); @@ -236,7 +232,6 @@ int main(int argc, char** argv) break; counter++; - for_each_irq(NULL, force_rebalance_irq, NULL); } free_object_tree(); return EXIT_SUCCESS; diff --git a/irqbalance.h 
b/irqbalance.h index 81e4cf5..1321219 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -32,7 +32,7 @@ extern void set_msi_interrupt_numa(int number); extern GList *rebalance_irq_list; -void calculate_workload(void); +void update_migration_status(void); void reset_counts(void); void dump_workloads(void); void sort_irq_list(GList **list); diff --git a/irqlist.c b/irqlist.c index e69f220..462f1b8 100644 --- a/irqlist.c +++ b/irqlist.c @@ -36,36 +36,6 @@ -void get_affinity_hint(struct irq_info *irq, int number) -{ - char buf[PATH_MAX]; - cpumask_t tempmask; - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/affinity_hint", number); - file = fopen(buf, "r"); - if (!file) - return; - if (getline(&line, &size, file)==0) { - free(line); - fclose(file); - return; - } - cpumask_parse_user(line, strlen(line), tempmask); - if (!__cpus_full(&tempmask, num_possible_cpus())) - irq->affinity_hint = tempmask; - fclose(file); - free(line); -} - -void build_workload(struct irq_info *info, void *unused __attribute__((unused))) -{ - info->workload = info->irq_count - info->last_irq_count + info->workload/3; - class_counts[info->class]++; - info->last_irq_count = info->irq_count; -} - struct load_balance_info { unsigned long long int total_load; unsigned long long avg_load; @@ -98,8 +68,15 @@ static void move_candidate_irqs(struct irq_info *info, void *data) { int *remaining_deviation = (int *)data; + /* Don't rebalance irqs that don't want it */ + if (info->level == BALANCE_NONE) + return; + + /* Don't move cpus that only have one irq, regardless of load */ if (g_list_length(info->assigned_obj->interrupts) <= 1) return; + + /* Stop rebalancing if we've estimated a full reduction of deviation */ if (*remaining_deviation <= 0) return; @@ -155,20 +132,17 @@ static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data) for_each_##name(NULL, migrate_overloaded_irqs, &(info));\ }while(0) -void calculate_workload(void) +void 
update_migration_status(void) { - int i; struct load_balance_info info; - for (i=0; i<7; i++) - class_counts[i]=0; - for_each_irq(NULL, build_workload, NULL); find_overloaded_objs(cpu_core, info); find_overloaded_objs(cache_domain, info); find_overloaded_objs(package, info); find_overloaded_objs(numa_node, info); } + static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused))) { info->last_irq_count = info->irq_count; @@ -183,7 +157,7 @@ void reset_counts(void) static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) { - printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->workload); + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->load); } void dump_workloads(void) diff --git a/numa.c b/numa.c index c239c55..ce3eb67 100644 --- a/numa.c +++ b/numa.c @@ -39,7 +39,7 @@ GList *numa_nodes = NULL; struct numa_node unspecified_node = { .common = { - .workload = 0, + .load = 0, .number = -1, .mask = CPU_MASK_ALL, .interrupts = NULL, diff --git a/placement.c b/placement.c index b6919fa..8f5623a 100644 --- a/placement.c +++ b/placement.c @@ -32,92 +32,36 @@ int power_mode; GList *rebalance_irq_list; -static uint64_t package_cost_func(struct irq_info *irq, struct package *package) -{ - int bonus = 0; - int maxcount; - - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; - - /* in power save mode, you better be on package 0, with overflow to the next package if really needed */ - if (power_mode) - bonus += POWER_MODE_PACKAGE_THRESHOLD * package->common.number; - - /* if we're out of whack in terms of per class counts.. 
just block (except in power mode) */ - maxcount = (class_counts[irq->class] + package_count -1 ) / package_count; - if (package->class_count[irq->class]>=maxcount && !power_mode) - bonus += 300000; - - return irq->workload + bonus; -} - -static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain *cache_domain) -{ - int bonus = 0; - - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; - - - /* pay 6000 for each previous interrupt of the same class */ - bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class]; - - /* try to avoid having a lot of MSI interrupt (globally, no by devide id) on - * cache domain */ - if ((irq->type == IRQ_TYPE_MSI) || (irq->type == IRQ_TYPE_MSIX)) - bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class]; - - - return irq->workload + bonus; -} - -static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu) -{ - int bonus = 0; - - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; - - /* - * since some chipsets only place at the first cpu, give a tiny preference to non-first - * cpus for specifically placed interrupts - */ - if (first_cpu(cpu_cache_domain(cpu)->common.mask)==cpu->common.number) - bonus++; - - /* pay 6000 for each previous interrupt of the same class */ - bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class]; - - return irq->workload + bonus; -} - -struct cache_domain_placement { - struct irq_info *info; - struct cache_domain *best; - uint64_t best_cost; +struct obj_placement { + struct common_obj_data *best; + struct common_obj_data *least_irqs; + uint64_t best_cost; + struct irq_info *info; }; -static void find_best_cd(struct common_obj_data *d, void *data) +static void find_best_object(struct common_obj_data *d, void *data) { - struct cache_domain *c = (struct cache_domain *)d; - struct 
cache_domain_placement *best = data; + struct obj_placement *best = (struct obj_placement *)data; uint64_t newload; - newload = c->common.workload + cache_domain_cost_func(best->info, c); + newload = d->load; if (newload < best->best_cost) { - best->best = c; + best->best = d; best->best_cost = newload; + best->least_irqs = NULL; } -} + + if (newload == best->best_cost) { + if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts)) + best->least_irqs = d; + } +} static void place_irq_in_cache_domain(struct irq_info *info, void *data) { struct package *p = data; - struct cache_domain_placement place; + struct obj_placement place; + struct common_obj_data *asign; if (!info->moved) return; @@ -125,16 +69,19 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) if (info->level <= BALANCE_PACKAGE) return; - place.best_cost = INT_MAX; - place.best = NULL; + place.info = info; + place.best = NULL; + place.least_irqs = NULL; + place.best_cost = INT_MAX; - for_each_cache_domain(p->cache_domains, find_best_cd, &place); + for_each_cache_domain(p->cache_domains, find_best_object, &place); - if (place.best) { - migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info); - info->assigned_obj = (struct common_obj_data *)place.best; - place.best->class_count[info->class]++; + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&p->common.interrupts, &asign->interrupts, info); + info->assigned_obj = asign; } } @@ -146,30 +93,11 @@ static void place_cache_domain(struct common_obj_data *d, void *data __attribute for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package); } - -struct core_placement { - struct cpu_core *best; - uint64_t best_cost; - struct irq_info *info; -}; - -static void place_irq_in_core(struct common_obj_data *d, void *data) -{ - struct cpu_core *c = (struct cpu_core *)d; - struct core_placement *best = data; - uint64_t newload; - - newload = c->common.workload + cpu_cost_func(best->info, c); - if (newload < best->best_cost) { - best->best = c; - best->best_cost = newload; - } -} - static void place_core(struct irq_info *info, void *data) { struct cache_domain *c = data; - struct core_placement place; + struct obj_placement place; + struct common_obj_data *asign; if (!info->moved) return; @@ -180,14 +108,17 @@ static void place_core(struct irq_info *info, void *data) place.info = info; place.best = NULL; + place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_cpu_core(c->cpu_cores, place_irq_in_core, &place); + for_each_cpu_core(c->cpu_cores, find_best_object, &place); - if (place.best) { - migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info); - info->assigned_obj = (struct common_obj_data *)place.best; - place.best->common.workload += info->workload + 1; + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&c->common.interrupts, &asign->interrupts, info); + info->assigned_obj = asign; + asign->load += info->load; } } @@ -199,29 +130,11 @@ static void place_cores(struct common_obj_data *d, void *data __attribute__((unu for_each_irq(cache_domain->common.interrupts, place_core, cache_domain); } -struct package_placement { - struct irq_info *info; - struct package *best; - uint64_t best_cost; -}; - -static void find_best_package(struct common_obj_data *d, void *data) -{ - struct package *p = (struct package *)d; - uint64_t newload; - struct package_placement *place = data; - - newload = p->common.workload + package_cost_func(place->info, p); - if (newload < place->best_cost) { - place->best = p; - place->best_cost = newload; - } -} - static void place_irq_in_package(struct irq_info *info, void *data) { - struct package_placement place; + struct obj_placement place; struct numa_node *n = data; + struct common_obj_data *asign; if (!info->moved) return; @@ -229,17 +142,19 @@ static void place_irq_in_package(struct irq_info *info, void *data) if (info->level == BALANCE_NONE) return; - place.best_cost = INT_MAX; - place.best = NULL; place.info = info; + place.best = NULL; + place.least_irqs = NULL; + place.best_cost = INT_MAX; - for_each_package(n->packages, find_best_package, &place); + for_each_package(n->packages, find_best_object, &place); - if (place.best) { - migrate_irq(&n->common.interrupts, &place.best->common.interrupts, info); - info->assigned_obj = (struct common_obj_data *)place.best; - place.best->common.workload += info->workload + 1; - place.best->class_count[info->class]++; + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&n->common.interrupts, &asign->interrupts, info); + info->assigned_obj = asign; + asign->load += info->load; } } @@ -250,29 +165,10 @@ static void place_packages(struct common_obj_data *d, void *data __attribute__(( for_each_irq(n->common.interrupts, place_irq_in_package, n); } -struct node_placement { - struct irq_info *info; - struct numa_node *best; - uint64_t best_cost; -}; - -static void find_best_node(struct common_obj_data *d, void *data) -{ - struct numa_node *n = (struct numa_node *)d; - struct node_placement *place = data; - - /* - * Just find the least loaded node - */ - if (n->common.workload < place->best_cost) { - place->best = n; - place->best_cost = n->common.workload; - } -} - static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) { - struct node_placement place; + struct obj_placement place; + struct common_obj_data *asign; if( info->level == BALANCE_NONE) return; @@ -284,20 +180,23 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u */ migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info); info->assigned_obj = (struct common_obj_data *)irq_numa_node(info); - irq_numa_node(info)->common.workload += info->workload + 1; + irq_numa_node(info)->common.load += info->load + 1; return; } place.best_cost = INT_MAX; place.best = NULL; + place.least_irqs = NULL; place.info = info; - for_each_numa_node(NULL, find_best_node, &place); + for_each_numa_node(NULL, find_best_object, &place); - if (place.best) { - migrate_irq(&rebalance_irq_list, &place.best->common.interrupts, info); - info->assigned_obj = (struct common_obj_data *)place.best; - place.best->common.workload += info->workload + 1; + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&rebalance_irq_list, &asign->interrupts, info); + info->assigned_obj = asign; + asign->load += info->load; } } diff --git a/procinterrupts.c b/procinterrupts.c index 20c5551..0bdef18 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -93,7 +93,8 @@ void parse_proc_interrupts(void) } if (cpunr != core_count) need_cpu_rescan = 1; - + + info->last_irq_count = info->irq_count; info->irq_count = count; /* is interrupt MSI based? */ @@ -135,7 +136,7 @@ static void compute_irq_load_share(struct common_obj_data *d, void *data __attri for_each_irq(cpu->common.interrupts, accumulate_irq_count, &total_irq_counts); - load_slice = cpu->common.load / total_irq_counts; + load_slice = total_irq_counts ? (cpu->common.load / total_irq_counts) : 1; for_each_irq(cpu->common.interrupts, assign_load_slice, &load_slice); } diff --git a/types.h b/types.h index f380d04..3c13759 100644 --- a/types.h +++ b/types.h @@ -28,7 +28,6 @@ struct common_obj_data { - uint64_t workload; uint64_t load; int number; cpumask_t mask; @@ -43,7 +42,6 @@ struct numa_node { struct package { struct common_obj_data common; struct numa_node *numa_node; - int class_count[7]; GList *cache_domains; }; @@ -51,7 +49,6 @@ struct cache_domain { struct common_obj_data common; int marker; struct package *package; - int class_count[7]; GList *cpu_cores; }; @@ -60,7 +57,6 @@ struct cpu_core { struct common_obj_data common; int marker; struct cache_domain *cache_domain; - int class_count[7]; uint64_t irq_load; uint64_t softirq_load; }; @@ -75,7 +71,6 @@ struct irq_info { cpumask_t affinity_hint; uint64_t irq_count; uint64_t last_irq_count; - uint64_t workload; uint64_t load; int moved; struct common_obj_data *assigned_obj; From 80157dd69a6e6c6ebf00a662b82c09021adc8a6a Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Thu, 6 Oct 2011 11:57:45 -0400 Subject: [PATCH 25/44] Fix a crash resulting from trying to parse a misread cpumask. 
If we don't read a cpumask from the local_cpus sysfs file, we shouldn't try to parse it Signed-off-by: Petr Holasek Signed-off-by: Neil Horman --- classify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classify.c b/classify.c index be3efa5..14c17d4 100644 --- a/classify.c +++ b/classify.c @@ -154,7 +154,7 @@ assign_node: lcpu_mask = NULL; rc = fscanf(fd, "%as", &lcpu_mask); fclose(fd); - if (!lcpu_mask) { + if (!lcpu_mask || !rc) { cpus_setall(new->cpumask); } else { cpumask_parse_user(lcpu_mask, strlen(lcpu_mask), From 3252189949762a1fd85ad19b21f6393a7ab93280 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 6 Oct 2011 13:53:07 -0400 Subject: [PATCH 26/44] Add back improved affinity_hint handling The new balancer can now deal with affinity hinting again, this time in a reasonably sane manner. Whereas before having an affinity hint caused irqbalance to just assign that hint as the affinity, we now have a policy based operation, controlled by the hintpolicy option.
The policy can be one of: exact - affinity_hint is applied for that irq without balancing consideration subset - balancing takes place, but assigned affinity will be a subset of the hint ignore - affinity_hint is ignored entirely --- activate.c | 9 ++++++++- classify.c | 2 -- irqbalance.1 | 13 +++++++++++++ irqbalance.c | 21 ++++++++++++++++----- irqbalance.h | 18 ++++++++++++------ irqlist.c | 7 +++++++ placement.c | 14 ++++++++++++++ types.h | 1 - 8 files changed, 70 insertions(+), 15 deletions(-) diff --git a/activate.c b/activate.c index b201e9b..292c44a 100644 --- a/activate.c +++ b/activate.c @@ -36,6 +36,7 @@ static void activate_mapping(struct irq_info *info, void *data __attribute__((un { char buf[PATH_MAX]; FILE *file; + cpumask_t applied_mask; /* * only activate mappings for irqs that have moved @@ -52,7 +53,13 @@ static void activate_mapping(struct irq_info *info, void *data __attribute__((un if (!file) return; - cpumask_scnprintf(buf, PATH_MAX, info->assigned_obj->mask); + if ((hint_policy == HINT_POLICY_EXACT) && + (!cpus_empty(info->affinity_hint))) + applied_mask = info->affinity_hint; + else + applied_mask = info->assigned_obj->mask; + + cpumask_scnprintf(buf, PATH_MAX, applied_mask); fprintf(file, "%s", buf); fclose(file); info->moved = 0; /*migration is done*/ diff --git a/classify.c b/classify.c index 14c17d4..0a4d46c 100644 --- a/classify.c +++ b/classify.c @@ -25,8 +25,6 @@ int map_class_to_level[7] = { BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CACHE, BALANCE_NONE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE }; -int class_counts[7]; - #define MAX_CLASS 0x12 /* * Class codes lifted from pci spec, appendix D. 
diff --git a/irqbalance.1 b/irqbalance.1 index c26c709..8fbc48d 100755 --- a/irqbalance.1 +++ b/irqbalance.1 @@ -41,6 +41,19 @@ Causes irqbalance to be run once, after which the daemon exits .B --debug Causes irqbalance to run in the foreground and extra debug information to be printed +.TP +.B --hintpolicy=[exact | subset | ignore] +Set the policy for how irq kernel affinity hinting is treated. Can be one of: +.P +.I exact +irq affinity hint is applied unilaterally and never violated +.P +.I subset +irq is balanced, but the assigned object will be a subset of the affintiy hint +.P +.I ignore +irq affinity hint value is completely ignored + .SH "ENVIRONMENT VARIABLES" .TP .B IRQBALANCE_ONESHOT diff --git a/irqbalance.c b/irqbalance.c index 6607d38..9f08bce 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -38,12 +38,10 @@ int one_shot_mode; int debug_mode; int numa_avail; - int need_cpu_rescan; - extern cpumask_t banned_cpus; - static int counter; +enum hp_e hint_policy = HINT_POLICY_SUBSET; void sleep_approx(int seconds) @@ -64,12 +62,13 @@ void sleep_approx(int seconds) struct option lopts[] = { {"oneshot", 0, NULL, 'o'}, {"debug", 0, NULL, 'd'}, + {"hintpolicy", 1, NULL, 'h'}, {0, 0, 0, 0} }; static void usage(void) { - printf("irqbalance [--oneshot | -o] [--debug | -d]"); + printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]"); } static void parse_command_line(int argc, char **argv) @@ -78,7 +77,7 @@ static void parse_command_line(int argc, char **argv) int longind; while ((opt = getopt_long(argc, argv, - "", + "odh:", lopts, &longind)) != -1) { switch(opt) { @@ -88,6 +87,18 @@ static void parse_command_line(int argc, char **argv) case 'd': debug_mode=1; break; + case 'h': + if (!strncmp(optarg, "exact", strlen(optarg))) + hint_policy = HINT_POLICY_EXACT; + else if (!strncmp(optarg, "subset", strlen(optarg))) + hint_policy = HINT_POLICY_SUBSET; + else if (!strncmp(optarg, "ignore", strlen(optarg))) + hint_policy = 
HINT_POLICY_IGNORE; + else { + usage(); + exit(1); + } + break; case 'o': one_shot_mode=1; break; diff --git a/irqbalance.h b/irqbalance.h index 1321219..e4ebf95 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -16,12 +16,6 @@ extern int package_count; extern int cache_domain_count; extern int core_count; extern char *classes[]; -extern int map_class_to_level[7]; -extern int class_counts[7]; -extern int debug_mode; -extern int power_mode; -extern int need_cpu_rescan; -extern int one_shot_mode; extern void parse_cpu_tree(void); extern void clear_work_stats(void); @@ -46,6 +40,18 @@ void clear_cpu_tree(void); void pci_numa_scan(void); /*===================NEW BALANCER FUNCTIONS============================*/ +enum hp_e { + HINT_POLICY_IGNORE, + HINT_POLICY_SUBSET, + HINT_POLICY_EXACT +}; + +extern int debug_mode; +extern int one_shot_mode; +extern int power_mode; +extern int need_cpu_rescan; +extern enum hp_e hint_policy; + /* * Numa node access routines */ diff --git a/irqlist.c b/irqlist.c index 462f1b8..898b64d 100644 --- a/irqlist.c +++ b/irqlist.c @@ -68,6 +68,13 @@ static void move_candidate_irqs(struct irq_info *info, void *data) { int *remaining_deviation = (int *)data; + /* never move an irq that has an afinity hint when + * hint_policy is HINT_POLICY_EXACT + */ + if (hint_policy == HINT_POLICY_EXACT) + if (!cpus_empty(info->affinity_hint)) + return; + /* Don't rebalance irqs that don't want it */ if (info->level == BALANCE_NONE) return; diff --git a/placement.c b/placement.c index 8f5623a..835226a 100644 --- a/placement.c +++ b/placement.c @@ -43,6 +43,20 @@ static void find_best_object(struct common_obj_data *d, void *data) { struct obj_placement *best = (struct obj_placement *)data; uint64_t newload; + cpumask_t subset; + + /* + * If the hint policy is subset, then we only want + * to consider objects that are within the irqs hint, but + * only if that irq in fact has published a hint + */ + if (hint_policy == HINT_POLICY_SUBSET) { + if 
(!cpus_empty(best->info->affinity_hint)) { + cpus_and(subset, best->info->affinity_hint, d->mask); + if (cpus_empty(subset)) + return; + } + } newload = d->load; if (newload < best->best_cost) { diff --git a/types.h b/types.h index 3c13759..4fad7bb 100644 --- a/types.h +++ b/types.h @@ -26,7 +26,6 @@ #define IRQ_TYPE_MSI 1 #define IRQ_TYPE_MSIX 2 - struct common_obj_data { uint64_t load; int number; From 714356689eccdfe79fba71ab05970bea0cee6e67 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 7 Oct 2011 19:42:43 -0400 Subject: [PATCH 27/44] Fix some minor errors: 1) Correct computation of standard deviation (devide by n-1 rather than n) 2) Fix irq migration to only consider irqs assigned to object that is overloaded 3) Filter initial forced rebalancing to leave BALANCE_NONE irqs alone Signed-off-by: Neil Horman --- cputree.c | 1 + irqbalance.c | 4 ++++ irqlist.c | 7 ++++--- placement.c | 3 +-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cputree.c b/cputree.c index 2d7787c..5dc1085 100644 --- a/cputree.c +++ b/cputree.c @@ -116,6 +116,7 @@ static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, if (!cache) return NULL; cache->common.mask = cache_mask; + cache->common.number = cache_domain_count; cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; } diff --git a/irqbalance.c b/irqbalance.c index 9f08bce..645adaa 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -140,6 +140,9 @@ static void dump_object_tree() static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused))) { + if (info->level == BALANCE_NONE) + return; + migrate_irq((info->assigned_obj ? 
&info->assigned_obj->interrupts : NULL), &rebalance_irq_list, info); info->assigned_obj = NULL; @@ -241,6 +244,7 @@ int main(int argc, char** argv) dump_tree(); if (one_shot_mode) break; + clear_work_stats(); counter++; } diff --git a/irqlist.c b/irqlist.c index 898b64d..ba866d5 100644 --- a/irqlist.c +++ b/irqlist.c @@ -107,7 +107,6 @@ static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data) deviation = obj->load - info->avg_load; - if ((deviation > info->std_deviation) && (g_list_length(obj->interrupts) > 1)) { /* @@ -124,17 +123,19 @@ static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data) * and migrate irqs until we only have one left, or until that * difference reaches zero */ - for_each_irq(NULL, move_candidate_irqs, &deviation); + for_each_irq(obj->interrupts, move_candidate_irqs, &deviation); } } #define find_overloaded_objs(name, info) do {\ + int ___load_sources;\ memset(&(info), 0, sizeof(struct load_balance_info));\ for_each_##name(NULL, gather_load_stats, &(info));\ (info).avg_load = (info).total_load / (info).load_sources;\ for_each_##name(NULL, compute_deviations, &(info));\ - (info).std_deviation = (long double)((info).deviations / ((info).load_sources));\ + ___load_sources = ((info).load_sources == 1) ? 
1 : ((info).load_sources - 1);\ + (info).std_deviation = (long double)((info).deviations / ___load_sources);\ (info).std_deviation = sqrt((info).std_deviation);\ for_each_##name(NULL, migrate_overloaded_irqs, &(info));\ }while(0) diff --git a/placement.c b/placement.c index 835226a..a7d1e17 100644 --- a/placement.c +++ b/placement.c @@ -216,9 +216,8 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u static void validate_irq(struct irq_info *info, void *data) { - printf("Validating irq %d %p against %p\n", info->irq, info->assigned_obj, data); if (info->assigned_obj != data) - printf("irq %d is wrong, points to %p, should be %p\n", + printf("object validation error: irq %d is wrong, points to %p, should be %p\n", info->irq, info->assigned_obj, data); } From 171bbdce01f93eea7656e991cfc799ed4fe40c27 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 08:46:12 -0400 Subject: [PATCH 28/44] Fix a crash on topology change Rebuilding the topology and irq databases resulted in some bogus manipulation of a glib list. Apparently, g_list_free[_full] doesn't leave the list pointer you're using in a valid state and we have to re-null it. 
Signed-off-by: Neil Horman --- classify.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/classify.c b/classify.c index 0a4d46c..49212d4 100644 --- a/classify.c +++ b/classify.c @@ -63,11 +63,6 @@ static gint compare_ints(gconstpointer a, gconstpointer b) return ai->irq - bi->irq; } -static void free_int(gpointer data) -{ - free(data); -} - /* * Inserts an irq_info struct into the intterupts_db list * devpath points to the device directory in sysfs for the @@ -241,9 +236,16 @@ done: return; } +static void free_irq(struct irq_info *info, void *data __attribute__((unused))) +{ + free(info); +} + void free_irq_db(void) { - g_list_free_full(interrupts_db, free_int); + for_each_irq(NULL, free_irq, NULL); + g_list_free(interrupts_db); + interrupts_db = NULL; } void rebuild_irq_db(void) @@ -251,7 +253,7 @@ void rebuild_irq_db(void) DIR *devdir = opendir(SYSDEV_DIR); struct dirent *entry; - g_list_free_full(interrupts_db, free_int); + free_irq_db(); if (!devdir) return; From c739059a6171d3536bdb92bcde3657eed28f6b5d Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 08:54:54 -0400 Subject: [PATCH 29/44] Add configure.ac checks to ensure presence of libnuma and libm irqbalance currently requries libnuma and libm be present. Augment the configure script to ensure they are available Signed-off-by: Neil Horman --- configure.ac | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configure.ac b/configure.ac index 523b2ea..fd0385f 100644 --- a/configure.ac +++ b/configure.ac @@ -55,6 +55,8 @@ AC_CHECK_HEADERS(linux/ethtool.h linux/sockios.h, [], []) AC_CHECK_FUNCS(getopt_long) +AC_CHECK_LIB(numa, numa_run_on_node, [], []) +AC_CHECK_LIB(m, floor, [], []) AC_C_CONST AC_C_INLINE From c10d9540e81b4a8a86b3035f98e9531f72bbda1e Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 09:02:44 -0400 Subject: [PATCH 30/44] Clean up some unused data members Some of our data structures had dangling unused fields. 
Get rid of them --- types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/types.h b/types.h index 4fad7bb..5b9c68c 100644 --- a/types.h +++ b/types.h @@ -46,7 +46,6 @@ struct package { struct cache_domain { struct common_obj_data common; - int marker; struct package *package; GList *cpu_cores; }; @@ -54,7 +53,6 @@ struct cache_domain { struct cpu_core { struct common_obj_data common; - int marker; struct cache_domain *cache_domain; uint64_t irq_load; uint64_t softirq_load; From decce934bcb3daebbc76de5155eac0c1baff7127 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 09:20:33 -0400 Subject: [PATCH 31/44] Merge all topology objects to a common structure Theres no need to treat topology objects differently. We can merge them all down to a common structure. This will make balancing code a great deal more concise. --- cputree.c | 152 ++++++++++++++++++++++------------------------- irqbalance.h | 12 ++-- irqlist.c | 2 +- numa.c | 56 +++++++++-------- placement.c | 39 ++++++------ procinterrupts.c | 19 +++--- types.h | 29 +-------- 7 files changed, 135 insertions(+), 174 deletions(-) diff --git a/cputree.c b/cputree.c index 5dc1085..ead576f 100644 --- a/cputree.c +++ b/cputree.c @@ -55,32 +55,32 @@ cpumask_t cpu_possible_map; */ static cpumask_t unbanned_cpus; -static struct package* add_cache_domain_to_package(struct cache_domain *cache, +static struct common_obj_data* add_cache_domain_to_package(struct common_obj_data *cache, cpumask_t package_mask) { GList *entry; - struct package *package; - struct cache_domain *lcache; + struct common_obj_data *package; + struct common_obj_data *lcache; entry = g_list_first(packages); while (entry) { package = entry->data; - if (cpus_equal(package_mask, package->common.mask)) + if (cpus_equal(package_mask, package->mask)) break; entry = g_list_next(entry); } if (!entry) { - package = calloc(sizeof(struct package), 1); + package = calloc(sizeof(struct common_obj_data), 1); if (!package) return NULL; - 
package->common.mask = package_mask; + package->mask = package_mask; packages = g_list_append(packages, package); package_count++; } - entry = g_list_first(package->cache_domains); + entry = g_list_first(package->children); while (entry) { lcache = entry->data; if (lcache == cache) @@ -89,39 +89,39 @@ static struct package* add_cache_domain_to_package(struct cache_domain *cache, } if (!entry) { - package->cache_domains = g_list_append(package->cache_domains, cache); - cache->package = package; + package->children = g_list_append(package->children, cache); + cache->parent = package; } return package; } -static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, +static struct common_obj_data* add_cpu_to_cache_domain(struct common_obj_data *cpu, cpumask_t cache_mask) { GList *entry; - struct cache_domain *cache; - struct cpu_core *lcpu; + struct common_obj_data *cache; + struct common_obj_data *lcpu; entry = g_list_first(cache_domains); while (entry) { cache = entry->data; - if (cpus_equal(cache_mask, cache->common.mask)) + if (cpus_equal(cache_mask, cache->mask)) break; entry = g_list_next(entry); } if (!entry) { - cache = calloc(sizeof(struct cache_domain), 1); + cache = calloc(sizeof(struct common_obj_data), 1); if (!cache) return NULL; - cache->common.mask = cache_mask; - cache->common.number = cache_domain_count; + cache->mask = cache_mask; + cache->number = cache_domain_count; cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; } - entry = g_list_first(cache->cpu_cores); + entry = g_list_first(cache->children); while (entry) { lcpu = entry->data; if (lcpu == cpu) @@ -130,8 +130,8 @@ static struct cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, } if (!entry) { - cache->cpu_cores = g_list_append(cache->cpu_cores, cpu); - cpu->cache_domain = cache; + cache->children = g_list_append(cache->children, cpu); + cpu->parent = (struct common_obj_data *)cache; } return cache; @@ -139,12 +139,12 @@ static struct 
cache_domain* add_cpu_to_cache_domain(struct cpu_core *cpu, static void do_one_cpu(char *path) { - struct cpu_core *cpu; + struct common_obj_data *cpu; FILE *file; char new_path[PATH_MAX]; cpumask_t cache_mask, package_mask; - struct cache_domain *cache; - struct package *package; + struct common_obj_data *cache; + struct common_obj_data *package; DIR *dir; struct dirent *entry; int nodeid; @@ -165,19 +165,18 @@ static void do_one_cpu(char *path) free(line); } - cpu = malloc(sizeof(struct cpu_core)); + cpu = calloc(sizeof(struct common_obj_data), 1); if (!cpu) return; - memset(cpu, 0, sizeof(struct cpu_core)); - cpu->common.number = strtoul(&path[27], NULL, 10); + cpu->number = strtoul(&path[27], NULL, 10); - cpu_set(cpu->common.number, cpu_possible_map); + cpu_set(cpu->number, cpu_possible_map); - cpu_set(cpu->common.number, cpu->common.mask); + cpu_set(cpu->number, cpu->mask); /* if the cpu is on the banned list, just don't add it */ - if (cpus_intersects(cpu->common.mask, banned_cpus)) { + if (cpus_intersects(cpu->mask, banned_cpus)) { free(cpu); /* even though we don't use the cpu we do need to count it */ core_count++; @@ -188,7 +187,7 @@ static void do_one_cpu(char *path) /* try to read the package mask; if it doesn't exist assume solitary */ snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path); file = fopen(new_path, "r"); - cpu_set(cpu->common.number, package_mask); + cpu_set(cpu->number, package_mask); if (file) { char *line = NULL; size_t size = 0; @@ -200,7 +199,7 @@ static void do_one_cpu(char *path) /* try to read the cache mask; if it doesn't exist assume solitary */ /* We want the deepest cache level available so try index1 first, then index2 */ - cpu_set(cpu->common.number, cache_mask); + cpu_set(cpu->number, cache_mask); snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path); file = fopen(new_path, "r"); if (file) { @@ -243,9 +242,9 @@ static void do_one_cpu(char *path) blank out the banned cpus from the various masks so 
that interrupts will never be told to go there */ - cpus_and(cpu_cache_domain(cpu)->common.mask, cpu_cache_domain(cpu)->common.mask, unbanned_cpus); - cpus_and(cpu_package(cpu)->common.mask, cpu_package(cpu)->common.mask, unbanned_cpus); - cpus_and(cpu->common.mask, cpu->common.mask, unbanned_cpus); + cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus); + cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus); + cpus_and(cpu->mask, cpu->mask, unbanned_cpus); cpus = g_list_append(cpus, cpu); core_count++; @@ -256,39 +255,37 @@ static void dump_irq(struct irq_info *info, void *data) int spaces = (long int)data; int i; for (i=0; iirq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->load); + printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load); } -static void dump_cpu_core(struct common_obj_data *d, void *data __attribute__((unused))) +static void dump_common_obj_data(struct common_obj_data *d, void *data __attribute__((unused))) { - struct cpu_core *c = (struct cpu_core *)d; - printf(" CPU number %i numa_node is %d (load %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.load); - if (c->common.interrupts) - for_each_irq(c->common.interrupts, dump_irq, (void *)18); + struct common_obj_data *c = (struct common_obj_data *)d; + printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load); + if (c->interrupts) + for_each_irq(c->interrupts, dump_irq, (void *)18); } static void dump_cache_domain(struct common_obj_data *d, void *data) { - struct cache_domain *c = (struct cache_domain *)d; char *buffer = data; - cpumask_scnprintf(buffer, 4095, c->common.mask); - printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned 
long)c->common.load); - if (c->cpu_cores) - for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL); - if (c->common.interrupts) - for_each_irq(c->common.interrupts, dump_irq, (void *)10); + cpumask_scnprintf(buffer, 4095, d->mask); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load); + if (d->children) + for_each_cpu_core(d->children, dump_common_obj_data, NULL); + if (d->interrupts) + for_each_irq(d->interrupts, dump_irq, (void *)10); } static void dump_package(struct common_obj_data *d, void *data) { - struct package *p = (struct package *)d; char *buffer = data; - cpumask_scnprintf(buffer, 4096, p->common.mask); - printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.load); - if (p->cache_domains) - for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer); - if (p->common.interrupts) - for_each_irq(p->common.interrupts, dump_irq, (void *)2); + cpumask_scnprintf(buffer, 4096, d->mask); + printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load); + if (d->children) + for_each_cache_domain(d->children, dump_cache_domain, buffer); + if (d->interrupts) + for_each_irq(d->interrupts, dump_irq, (void *)2); } void dump_tree(void) @@ -299,31 +296,26 @@ void dump_tree(void) static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused))) { - struct cpu_core *c = (struct cpu_core *)d; - c->common.load = 0; - c->irq_load = 0; - c->softirq_load = 0; + struct common_obj_data *c = (struct common_obj_data *)d; + c->load = 0; } static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused))) { - struct cache_domain *c = (struct cache_domain *)d; - c->common.load = 0; - for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL); + d->load = 0; + 
for_each_cpu_core(d->children, clear_cpu_stats, NULL); } static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused))) { - struct package *p = (struct package *)d; - p->common.load = 0; - for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL); + d->load = 0; + for_each_cache_domain(d->children, clear_cd_stats, NULL); } static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused))) { - struct numa_node *n = (struct numa_node *)d; - n->common.load = 0; - for_each_package(n->packages, clear_package_stats, NULL); + d->load = 0; + for_each_package(d->children, clear_package_stats, NULL); } static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) @@ -382,15 +374,15 @@ void parse_cpu_tree(void) void clear_cpu_tree(void) { GList *item; - struct cpu_core *cpu; - struct cache_domain *cache_domain; - struct package *package; + struct common_obj_data *cpu; + struct common_obj_data *cache_domain; + struct common_obj_data *package; while (packages) { item = g_list_first(packages); package = item->data; - g_list_free(package->cache_domains); - g_list_free(package->common.interrupts); + g_list_free(package->children); + g_list_free(package->interrupts); free(package); packages = g_list_delete_link(packages, item); } @@ -399,8 +391,8 @@ void clear_cpu_tree(void) while (cache_domains) { item = g_list_first(cache_domains); cache_domain = item->data; - g_list_free(cache_domain->cpu_cores); - g_list_free(cache_domain->common.interrupts); + g_list_free(cache_domain->children); + g_list_free(cache_domain->interrupts); free(cache_domain); cache_domains = g_list_delete_link(cache_domains, item); } @@ -410,7 +402,7 @@ void clear_cpu_tree(void) while (cpus) { item = g_list_first(cpus); cpu = item->data; - g_list_free(cpu->common.interrupts); + g_list_free(cpu->interrupts); free(cpu); cpus = g_list_delete_link(cpus, item); } @@ -457,18 +449,18 @@ void for_each_cpu_core(GList *list, void 
(*cb)(struct common_obj_data *c, void * static gint compare_cpus(gconstpointer a, gconstpointer b) { - const struct cpu_core *ai = a; - const struct cpu_core *bi = b; + const struct common_obj_data *ai = a; + const struct common_obj_data *bi = b; - return ai->common.number - bi->common.number; + return ai->number - bi->number; } -struct cpu_core *find_cpu_core(int cpunr) +struct common_obj_data *find_cpu_core(int cpunr) { GList *entry; - struct cpu_core find; + struct common_obj_data find; - find.common.number = cpunr; + find.number = cpunr; entry = g_list_find_custom(cpus, &find, compare_cpus); return entry ? entry->data : NULL; diff --git a/irqbalance.h b/irqbalance.h index e4ebf95..c47f522 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -59,30 +59,30 @@ extern void build_numa_node_list(void); extern void free_numa_node_list(void); extern void dump_numa_node_info(struct common_obj_data *node, void *data); extern void for_each_numa_node(GList *list, void (*cb)(struct common_obj_data *node, void *data), void *data); -extern void add_package_to_node(struct package *p, int nodeid); -extern struct numa_node *get_numa_node(int nodeid); +extern void add_package_to_node(struct common_obj_data *p, int nodeid); +extern struct common_obj_data *get_numa_node(int nodeid); /* * Package functions */ -#define package_numa_node(p) ((p)->numa_node) +#define package_numa_node(p) ((p)->parent) extern void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *data), void *data); /* * cache_domain functions */ -#define cache_domain_package(c) ((c)->package) +#define cache_domain_package(c) ((c)->parent) #define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) extern void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); /* * cpu core functions */ -#define cpu_cache_domain(cpu) ((cpu)->cache_domain) +#define cpu_cache_domain(cpu) ((cpu)->parent) #define cpu_package(cpu) 
(cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) extern void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); -extern struct cpu_core *find_cpu_core(int cpunr); +extern struct common_obj_data *find_cpu_core(int cpunr); extern int get_cpu_count(void); /* diff --git a/irqlist.c b/irqlist.c index ba866d5..e5d93ca 100644 --- a/irqlist.c +++ b/irqlist.c @@ -165,7 +165,7 @@ void reset_counts(void) static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) { - printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->load); + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load); } void dump_workloads(void) diff --git a/numa.c b/numa.c index ce3eb67..6afa119 100644 --- a/numa.c +++ b/numa.c @@ -37,43 +37,42 @@ GList *numa_nodes = NULL; -struct numa_node unspecified_node = { - .common = { - .load = 0, - .number = -1, - .mask = CPU_MASK_ALL, - .interrupts = NULL, - }, - .packages = NULL, +struct common_obj_data unspecified_node = { + .load = 0, + .number = -1, + .mask = CPU_MASK_ALL, + .interrupts = NULL, + .children = NULL, + .parent = NULL, }; static void add_one_node(const char *nodename) { char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1); - struct numa_node *new; + struct common_obj_data *new; char *cpustr; FILE *f; if (!path) return; - new = calloc(1, sizeof(struct numa_node)); + new = calloc(1, sizeof(struct common_obj_data)); if (!new) return; sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); f = fopen(path, "r"); if (ferror(f)) { - cpus_clear(new->common.mask); + cpus_clear(new->mask); } else { fscanf(f, "%as", &cpustr); if (!cpustr) { - cpus_clear(new->common.mask); + 
cpus_clear(new->mask); } else { - cpumask_parse_user(cpustr, strlen(cpustr), new->common.mask); + cpumask_parse_user(cpustr, strlen(cpustr), new->mask); free(cpustr); } } - new->common.number = strtoul(&nodename[4], NULL, 10); + new->number = strtoul(&nodename[4], NULL, 10); numa_nodes = g_list_append(numa_nodes, new); } @@ -105,19 +104,19 @@ void free_numa_node_list(void) static gint compare_node(gconstpointer a, gconstpointer b) { - const struct numa_node *ai = a; - const struct numa_node *bi = b; + const struct common_obj_data *ai = a; + const struct common_obj_data *bi = b; - return (ai->common.number == bi->common.number) ? 0 : 1; + return (ai->number == bi->number) ? 0 : 1; } -void add_package_to_node(struct package *p, int nodeid) +void add_package_to_node(struct common_obj_data *p, int nodeid) { - struct numa_node find, *node; - find.common.number = nodeid; + struct common_obj_data find, *node; + find.number = nodeid; GList *entry; - find.common.number = nodeid; + find.number = nodeid; entry = g_list_find_custom(numa_nodes, &find, compare_node); if (!entry) { @@ -128,17 +127,16 @@ void add_package_to_node(struct package *p, int nodeid) node = entry->data; - node->packages = g_list_append(node->packages, p); - p->numa_node = node; + node->children = g_list_append(node->children, p); + p->parent = node; } void dump_numa_node_info(struct common_obj_data *d, void *unused __attribute__((unused))) { - struct numa_node *node = (struct numa_node *)d; char buffer[4096]; - printf("NUMA NODE NUMBER: %d\n", node->common.number); - cpumask_scnprintf(buffer, 4096, node->common.mask); + printf("NUMA NODE NUMBER: %d\n", d->number); + cpumask_scnprintf(buffer, 4096, d->mask); printf("LOCAL CPU MASK: %s\n", buffer); printf("\n"); } @@ -156,15 +154,15 @@ void for_each_numa_node(GList *list, void(*cb)(struct common_obj_data *node, voi } } -struct numa_node *get_numa_node(int nodeid) +struct common_obj_data *get_numa_node(int nodeid) { - struct numa_node find; + struct 
common_obj_data find; GList *entry; if (nodeid == -1) return &unspecified_node; - find.common.number = nodeid; + find.number = nodeid; entry = g_list_find_custom(numa_nodes, &find, compare_node); return entry ? entry->data : NULL; diff --git a/placement.c b/placement.c index a7d1e17..6680456 100644 --- a/placement.c +++ b/placement.c @@ -73,7 +73,7 @@ static void find_best_object(struct common_obj_data *d, void *data) static void place_irq_in_cache_domain(struct irq_info *info, void *data) { - struct package *p = data; + struct common_obj_data *p = data; struct obj_placement place; struct common_obj_data *asign; @@ -89,12 +89,12 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_cache_domain(p->cache_domains, find_best_object, &place); + for_each_cache_domain(p->children, find_best_object, &place); asign = place.least_irqs ? place.least_irqs : place.best; if (asign) { - migrate_irq(&p->common.interrupts, &asign->interrupts, info); + migrate_irq(&p->interrupts, &asign->interrupts, info); info->assigned_obj = asign; } @@ -102,14 +102,13 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) static void place_cache_domain(struct common_obj_data *d, void *data __attribute__((unused))) { - struct package *package = (struct package *)d; - if (package->common.interrupts) - for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package); + if (d->interrupts) + for_each_irq(d->interrupts, place_irq_in_cache_domain, d); } static void place_core(struct irq_info *info, void *data) { - struct cache_domain *c = data; + struct common_obj_data *c = data; struct obj_placement place; struct common_obj_data *asign; @@ -125,12 +124,12 @@ static void place_core(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_cpu_core(c->cpu_cores, find_best_object, &place); + for_each_cpu_core(c->children, find_best_object, &place); 
asign = place.least_irqs ? place.least_irqs : place.best; if (asign) { - migrate_irq(&c->common.interrupts, &asign->interrupts, info); + migrate_irq(&c->interrupts, &asign->interrupts, info); info->assigned_obj = asign; asign->load += info->load; } @@ -139,15 +138,14 @@ static void place_core(struct irq_info *info, void *data) static void place_cores(struct common_obj_data *d, void *data __attribute__((unused))) { - struct cache_domain *cache_domain = (struct cache_domain *)d; - if (cache_domain->common.interrupts) - for_each_irq(cache_domain->common.interrupts, place_core, cache_domain); + if (d->interrupts) + for_each_irq(d->interrupts, place_core, d); } static void place_irq_in_package(struct irq_info *info, void *data) { struct obj_placement place; - struct numa_node *n = data; + struct common_obj_data *n = data; struct common_obj_data *asign; if (!info->moved) @@ -161,12 +159,12 @@ static void place_irq_in_package(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_package(n->packages, find_best_object, &place); + for_each_package(n->children, find_best_object, &place); asign = place.least_irqs ? 
place.least_irqs : place.best; if (asign) { - migrate_irq(&n->common.interrupts, &asign->interrupts, info); + migrate_irq(&n->interrupts, &asign->interrupts, info); info->assigned_obj = asign; asign->load += info->load; } @@ -174,9 +172,8 @@ static void place_irq_in_package(struct irq_info *info, void *data) static void place_packages(struct common_obj_data *d, void *data __attribute__((unused))) { - struct numa_node *n = (struct numa_node *)d; - if (n->common.interrupts) - for_each_irq(n->common.interrupts, place_irq_in_package, n); + if (d->interrupts) + for_each_irq(d->interrupts, place_irq_in_package, d); } static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) @@ -187,14 +184,14 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u if( info->level == BALANCE_NONE) return; - if (irq_numa_node(info)->common.number != -1) { + if (irq_numa_node(info)->number != -1) { /* * This irq belongs to a device with a preferred numa node * put it on that node */ - migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info); + migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info); info->assigned_obj = (struct common_obj_data *)irq_numa_node(info); - irq_numa_node(info)->common.load += info->load + 1; + irq_numa_node(info)->load += info->load + 1; return; } diff --git a/procinterrupts.c b/procinterrupts.c index 0bdef18..d004c96 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -130,15 +130,14 @@ static void assign_load_slice(struct irq_info *info, void *data) static void compute_irq_load_share(struct common_obj_data *d, void *data __attribute__((unused))) { - struct cpu_core *cpu = (struct cpu_core *)d; uint64_t total_irq_counts = 0; uint64_t load_slice; - for_each_irq(cpu->common.interrupts, accumulate_irq_count, &total_irq_counts); + for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_counts); - load_slice = total_irq_counts ? 
(cpu->common.load / total_irq_counts) : 1; + load_slice = total_irq_counts ? (d->load / total_irq_counts) : 1; - for_each_irq(cpu->common.interrupts, assign_load_slice, &load_slice); + for_each_irq(d->interrupts, assign_load_slice, &load_slice); } void parse_proc_stat() @@ -147,7 +146,7 @@ void parse_proc_stat() char *line = NULL; size_t size = 0; int cpunr, rc, cpucount; - struct cpu_core *cpu; + struct common_obj_data *cpu; int irq_load, softirq_load; file = fopen("/proc/stat", "r"); @@ -189,12 +188,10 @@ void parse_proc_stat() * For each cpu add the irq and softirq load and propagate that * all the way up the device tree */ - cpu->irq_load = irq_load; - cpu->softirq_load = softirq_load; - cpu->common.load = irq_load + softirq_load; - cpu->cache_domain->common.load += cpu->common.load; - cpu->cache_domain->package->common.load += cpu->common.load; - cpu->cache_domain->package->numa_node->common.load += cpu->common.load; + cpu->load = irq_load + softirq_load; + cpu_cache_domain(cpu)->load += cpu->load; + cpu_package(cpu)->load += cpu->load; + cpu_numa_node(cpu)->load += cpu->load; } fclose(file); diff --git a/types.h b/types.h index 5b9c68c..0898497 100644 --- a/types.h +++ b/types.h @@ -31,31 +31,8 @@ struct common_obj_data { int number; cpumask_t mask; GList *interrupts; -}; - -struct numa_node { - struct common_obj_data common; - GList *packages; -}; - -struct package { - struct common_obj_data common; - struct numa_node *numa_node; - GList *cache_domains; -}; - -struct cache_domain { - struct common_obj_data common; - struct package *package; - GList *cpu_cores; -}; - - -struct cpu_core { - struct common_obj_data common; - struct cache_domain *cache_domain; - uint64_t irq_load; - uint64_t softirq_load; + struct common_obj_data *parent; + GList *children; }; struct irq_info { @@ -63,7 +40,7 @@ struct irq_info { int class; int type; int level; - struct numa_node *numa_node; + struct common_obj_data *numa_node; cpumask_t cpumask; cpumask_t affinity_hint; uint64_t 
irq_count; From 587ba2f4dd04b6f62f2324a3dcffa0ee6175ac15 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 11:27:48 -0400 Subject: [PATCH 32/44] Rename common_obj_data to topo_obj Since consolodating the topology objects to a single structure, it seems Better to rename it to something more descriptive --- cputree.c | 66 ++++++++++++++++++++++++------------------------ irqbalance.h | 16 ++++++------ irqlist.c | 6 ++--- numa.c | 22 ++++++++-------- placement.c | 30 +++++++++++----------- procinterrupts.c | 4 +-- types.h | 8 +++--- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/cputree.c b/cputree.c index ead576f..e1e2711 100644 --- a/cputree.c +++ b/cputree.c @@ -55,12 +55,12 @@ cpumask_t cpu_possible_map; */ static cpumask_t unbanned_cpus; -static struct common_obj_data* add_cache_domain_to_package(struct common_obj_data *cache, +static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, cpumask_t package_mask) { GList *entry; - struct common_obj_data *package; - struct common_obj_data *lcache; + struct topo_obj *package; + struct topo_obj *lcache; entry = g_list_first(packages); @@ -72,7 +72,7 @@ static struct common_obj_data* add_cache_domain_to_package(struct common_obj_dat } if (!entry) { - package = calloc(sizeof(struct common_obj_data), 1); + package = calloc(sizeof(struct topo_obj), 1); if (!package) return NULL; package->mask = package_mask; @@ -95,12 +95,12 @@ static struct common_obj_data* add_cache_domain_to_package(struct common_obj_dat return package; } -static struct common_obj_data* add_cpu_to_cache_domain(struct common_obj_data *cpu, +static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu, cpumask_t cache_mask) { GList *entry; - struct common_obj_data *cache; - struct common_obj_data *lcpu; + struct topo_obj *cache; + struct topo_obj *lcpu; entry = g_list_first(cache_domains); @@ -112,7 +112,7 @@ static struct common_obj_data* add_cpu_to_cache_domain(struct common_obj_data *c } if 
(!entry) { - cache = calloc(sizeof(struct common_obj_data), 1); + cache = calloc(sizeof(struct topo_obj), 1); if (!cache) return NULL; cache->mask = cache_mask; @@ -131,7 +131,7 @@ static struct common_obj_data* add_cpu_to_cache_domain(struct common_obj_data *c if (!entry) { cache->children = g_list_append(cache->children, cpu); - cpu->parent = (struct common_obj_data *)cache; + cpu->parent = (struct topo_obj *)cache; } return cache; @@ -139,12 +139,12 @@ static struct common_obj_data* add_cpu_to_cache_domain(struct common_obj_data *c static void do_one_cpu(char *path) { - struct common_obj_data *cpu; + struct topo_obj *cpu; FILE *file; char new_path[PATH_MAX]; cpumask_t cache_mask, package_mask; - struct common_obj_data *cache; - struct common_obj_data *package; + struct topo_obj *cache; + struct topo_obj *package; DIR *dir; struct dirent *entry; int nodeid; @@ -165,7 +165,7 @@ static void do_one_cpu(char *path) free(line); } - cpu = calloc(sizeof(struct common_obj_data), 1); + cpu = calloc(sizeof(struct topo_obj), 1); if (!cpu) return; @@ -258,26 +258,26 @@ static void dump_irq(struct irq_info *info, void *data) printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load); } -static void dump_common_obj_data(struct common_obj_data *d, void *data __attribute__((unused))) +static void dump_topo_obj(struct topo_obj *d, void *data __attribute__((unused))) { - struct common_obj_data *c = (struct common_obj_data *)d; + struct topo_obj *c = (struct topo_obj *)d; printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load); if (c->interrupts) for_each_irq(c->interrupts, dump_irq, (void *)18); } -static void dump_cache_domain(struct common_obj_data *d, void *data) +static void dump_cache_domain(struct topo_obj *d, void *data) { char *buffer = data; cpumask_scnprintf(buffer, 4095, d->mask); printf(" Cache domain %i: numa_node is %d cpu mask is 
%s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load); if (d->children) - for_each_cpu_core(d->children, dump_common_obj_data, NULL); + for_each_cpu_core(d->children, dump_topo_obj, NULL); if (d->interrupts) for_each_irq(d->interrupts, dump_irq, (void *)10); } -static void dump_package(struct common_obj_data *d, void *data) +static void dump_package(struct topo_obj *d, void *data) { char *buffer = data; cpumask_scnprintf(buffer, 4096, d->mask); @@ -294,25 +294,25 @@ void dump_tree(void) for_each_package(NULL, dump_package, buffer); } -static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused))) +static void clear_cpu_stats(struct topo_obj *d, void *data __attribute__((unused))) { - struct common_obj_data *c = (struct common_obj_data *)d; + struct topo_obj *c = (struct topo_obj *)d; c->load = 0; } -static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused))) +static void clear_cd_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; for_each_cpu_core(d->children, clear_cpu_stats, NULL); } -static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused))) +static void clear_package_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; for_each_cache_domain(d->children, clear_cd_stats, NULL); } -static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused))) +static void clear_node_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; for_each_package(d->children, clear_package_stats, NULL); @@ -374,9 +374,9 @@ void parse_cpu_tree(void) void clear_cpu_tree(void) { GList *item; - struct common_obj_data *cpu; - struct common_obj_data *cache_domain; - struct common_obj_data *package; + struct topo_obj *cpu; + struct topo_obj *cache_domain; + struct topo_obj *package; while (packages) { item = g_list_first(packages); @@ -411,7 +411,7 @@ void 
clear_cpu_tree(void) } -void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *data), void *data) +void for_each_package(GList *list, void (*cb)(struct topo_obj *p, void *data), void *data) { GList *entry = g_list_first(list ? list : packages); GList *next; @@ -423,7 +423,7 @@ void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *d } } -void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data) +void for_each_cache_domain(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data) { GList *entry = g_list_first(list ? list : cache_domains); GList *next; @@ -435,7 +435,7 @@ void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, vo } } -void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data) +void for_each_cpu_core(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data) { GList *entry = g_list_first(list ? list : cpus); GList *next; @@ -449,16 +449,16 @@ void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void * static gint compare_cpus(gconstpointer a, gconstpointer b) { - const struct common_obj_data *ai = a; - const struct common_obj_data *bi = b; + const struct topo_obj *ai = a; + const struct topo_obj *bi = b; return ai->number - bi->number; } -struct common_obj_data *find_cpu_core(int cpunr) +struct topo_obj *find_cpu_core(int cpunr) { GList *entry; - struct common_obj_data find; + struct topo_obj find; find.number = cpunr; entry = g_list_find_custom(cpus, &find, compare_cpus); diff --git a/irqbalance.h b/irqbalance.h index c47f522..fbced39 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -57,23 +57,23 @@ extern enum hp_e hint_policy; */ extern void build_numa_node_list(void); extern void free_numa_node_list(void); -extern void dump_numa_node_info(struct common_obj_data *node, void *data); -extern void for_each_numa_node(GList *list, void (*cb)(struct 
common_obj_data *node, void *data), void *data); -extern void add_package_to_node(struct common_obj_data *p, int nodeid); -extern struct common_obj_data *get_numa_node(int nodeid); +extern void dump_numa_node_info(struct topo_obj *node, void *data); +extern void for_each_numa_node(GList *list, void (*cb)(struct topo_obj *node, void *data), void *data); +extern void add_package_to_node(struct topo_obj *p, int nodeid); +extern struct topo_obj *get_numa_node(int nodeid); /* * Package functions */ #define package_numa_node(p) ((p)->parent) -extern void for_each_package(GList *list, void (*cb)(struct common_obj_data *p, void *data), void *data); +extern void for_each_package(GList *list, void (*cb)(struct topo_obj *p, void *data), void *data); /* * cache_domain functions */ #define cache_domain_package(c) ((c)->parent) #define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) -extern void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); +extern void for_each_cache_domain(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data); /* * cpu core functions @@ -81,8 +81,8 @@ extern void for_each_cache_domain(GList *list, void (*cb)(struct common_obj_data #define cpu_cache_domain(cpu) ((cpu)->parent) #define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) -extern void for_each_cpu_core(GList *list, void (*cb)(struct common_obj_data *c, void *data), void *data); -extern struct common_obj_data *find_cpu_core(int cpunr); +extern void for_each_cpu_core(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data); +extern struct topo_obj *find_cpu_core(int cpunr); extern int get_cpu_count(void); /* diff --git a/irqlist.c b/irqlist.c index e5d93ca..55e64b8 100644 --- a/irqlist.c +++ b/irqlist.c @@ -44,7 +44,7 @@ struct load_balance_info { long double std_deviation; }; -static void 
gather_load_stats(struct common_obj_data *obj, void *data) +static void gather_load_stats(struct topo_obj *obj, void *data) { struct load_balance_info *info = data; @@ -52,7 +52,7 @@ static void gather_load_stats(struct common_obj_data *obj, void *data) info->load_sources += 1; } -static void compute_deviations(struct common_obj_data *obj, void *data) +static void compute_deviations(struct topo_obj *obj, void *data) { struct load_balance_info *info = data; unsigned long long int deviation; @@ -94,7 +94,7 @@ static void move_candidate_irqs(struct irq_info *info, void *data) info->assigned_obj = NULL; } -static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data) +static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) { struct load_balance_info *info = data; int deviation; diff --git a/numa.c b/numa.c index 6afa119..d6b99d0 100644 --- a/numa.c +++ b/numa.c @@ -37,7 +37,7 @@ GList *numa_nodes = NULL; -struct common_obj_data unspecified_node = { +struct topo_obj unspecified_node = { .load = 0, .number = -1, .mask = CPU_MASK_ALL, @@ -49,13 +49,13 @@ struct common_obj_data unspecified_node = { static void add_one_node(const char *nodename) { char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1); - struct common_obj_data *new; + struct topo_obj *new; char *cpustr; FILE *f; if (!path) return; - new = calloc(1, sizeof(struct common_obj_data)); + new = calloc(1, sizeof(struct topo_obj)); if (!new) return; sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); @@ -104,15 +104,15 @@ void free_numa_node_list(void) static gint compare_node(gconstpointer a, gconstpointer b) { - const struct common_obj_data *ai = a; - const struct common_obj_data *bi = b; + const struct topo_obj *ai = a; + const struct topo_obj *bi = b; return (ai->number == bi->number) ? 
0 : 1; } -void add_package_to_node(struct common_obj_data *p, int nodeid) +void add_package_to_node(struct topo_obj *p, int nodeid) { - struct common_obj_data find, *node; + struct topo_obj find, *node; find.number = nodeid; GList *entry; @@ -131,7 +131,7 @@ void add_package_to_node(struct common_obj_data *p, int nodeid) p->parent = node; } -void dump_numa_node_info(struct common_obj_data *d, void *unused __attribute__((unused))) +void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused))) { char buffer[4096]; @@ -141,7 +141,7 @@ void dump_numa_node_info(struct common_obj_data *d, void *unused __attribute__(( printf("\n"); } -void for_each_numa_node(GList *list, void(*cb)(struct common_obj_data *node, void *data), void *data) +void for_each_numa_node(GList *list, void(*cb)(struct topo_obj *node, void *data), void *data) { GList *entry, *next; @@ -154,9 +154,9 @@ void for_each_numa_node(GList *list, void(*cb)(struct common_obj_data *node, voi } } -struct common_obj_data *get_numa_node(int nodeid) +struct topo_obj *get_numa_node(int nodeid) { - struct common_obj_data find; + struct topo_obj find; GList *entry; if (nodeid == -1) diff --git a/placement.c b/placement.c index 6680456..a2b537f 100644 --- a/placement.c +++ b/placement.c @@ -33,13 +33,13 @@ int power_mode; GList *rebalance_irq_list; struct obj_placement { - struct common_obj_data *best; - struct common_obj_data *least_irqs; + struct topo_obj *best; + struct topo_obj *least_irqs; uint64_t best_cost; struct irq_info *info; }; -static void find_best_object(struct common_obj_data *d, void *data) +static void find_best_object(struct topo_obj *d, void *data) { struct obj_placement *best = (struct obj_placement *)data; uint64_t newload; @@ -73,9 +73,9 @@ static void find_best_object(struct common_obj_data *d, void *data) static void place_irq_in_cache_domain(struct irq_info *info, void *data) { - struct common_obj_data *p = data; + struct topo_obj *p = data; struct obj_placement place; - 
struct common_obj_data *asign; + struct topo_obj *asign; if (!info->moved) return; @@ -100,7 +100,7 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) } -static void place_cache_domain(struct common_obj_data *d, void *data __attribute__((unused))) +static void place_cache_domain(struct topo_obj *d, void *data __attribute__((unused))) { if (d->interrupts) for_each_irq(d->interrupts, place_irq_in_cache_domain, d); @@ -108,9 +108,9 @@ static void place_cache_domain(struct common_obj_data *d, void *data __attribute static void place_core(struct irq_info *info, void *data) { - struct common_obj_data *c = data; + struct topo_obj *c = data; struct obj_placement place; - struct common_obj_data *asign; + struct topo_obj *asign; if (!info->moved) return; @@ -136,7 +136,7 @@ static void place_core(struct irq_info *info, void *data) } -static void place_cores(struct common_obj_data *d, void *data __attribute__((unused))) +static void place_cores(struct topo_obj *d, void *data __attribute__((unused))) { if (d->interrupts) for_each_irq(d->interrupts, place_core, d); @@ -145,8 +145,8 @@ static void place_cores(struct common_obj_data *d, void *data __attribute__((unu static void place_irq_in_package(struct irq_info *info, void *data) { struct obj_placement place; - struct common_obj_data *n = data; - struct common_obj_data *asign; + struct topo_obj *n = data; + struct topo_obj *asign; if (!info->moved) return; @@ -170,7 +170,7 @@ static void place_irq_in_package(struct irq_info *info, void *data) } } -static void place_packages(struct common_obj_data *d, void *data __attribute__((unused))) +static void place_packages(struct topo_obj *d, void *data __attribute__((unused))) { if (d->interrupts) for_each_irq(d->interrupts, place_irq_in_package, d); @@ -179,7 +179,7 @@ static void place_packages(struct common_obj_data *d, void *data __attribute__(( static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) { struct obj_placement 
place; - struct common_obj_data *asign; + struct topo_obj *asign; if( info->level == BALANCE_NONE) return; @@ -190,7 +190,7 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u * put it on that node */ migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info); - info->assigned_obj = (struct common_obj_data *)irq_numa_node(info); + info->assigned_obj = (struct topo_obj *)irq_numa_node(info); irq_numa_node(info)->load += info->load + 1; return; } @@ -218,7 +218,7 @@ static void validate_irq(struct irq_info *info, void *data) info->irq, info->assigned_obj, data); } -static void validate_object(struct common_obj_data *d, void *data __attribute__((unused))) +static void validate_object(struct topo_obj *d, void *data __attribute__((unused))) { if (d->interrupts) for_each_irq(d->interrupts, validate_irq, d); diff --git a/procinterrupts.c b/procinterrupts.c index d004c96..3b6e136 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -128,7 +128,7 @@ static void assign_load_slice(struct irq_info *info, void *data) info->load = (info->irq_count - info->last_irq_count) * *load_slice; } -static void compute_irq_load_share(struct common_obj_data *d, void *data __attribute__((unused))) +static void compute_irq_load_share(struct topo_obj *d, void *data __attribute__((unused))) { uint64_t total_irq_counts = 0; uint64_t load_slice; @@ -146,7 +146,7 @@ void parse_proc_stat() char *line = NULL; size_t size = 0; int cpunr, rc, cpucount; - struct common_obj_data *cpu; + struct topo_obj *cpu; int irq_load, softirq_load; file = fopen("/proc/stat", "r"); diff --git a/types.h b/types.h index 0898497..85d99e6 100644 --- a/types.h +++ b/types.h @@ -26,12 +26,12 @@ #define IRQ_TYPE_MSI 1 #define IRQ_TYPE_MSIX 2 -struct common_obj_data { +struct topo_obj { uint64_t load; int number; cpumask_t mask; GList *interrupts; - struct common_obj_data *parent; + struct topo_obj *parent; GList *children; }; @@ -40,14 +40,14 @@ struct irq_info { int class; 
int type; int level; - struct common_obj_data *numa_node; + struct topo_obj *numa_node; cpumask_t cpumask; cpumask_t affinity_hint; uint64_t irq_count; uint64_t last_irq_count; uint64_t load; int moved; - struct common_obj_data *assigned_obj; + struct topo_obj *assigned_obj; }; #endif From 1a287acc3752f1d9cf904fe52a544cb7daeff1cf Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 11:35:28 -0400 Subject: [PATCH 33/44] Add object type ennumeration to topo map Since we use a common object for our topology now, add some ennumeration so we can tell what type of object wer'e looking at when debugging --- cputree.c | 4 ++++ numa.c | 3 ++- types.h | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cputree.c b/cputree.c index e1e2711..d24e965 100644 --- a/cputree.c +++ b/cputree.c @@ -76,6 +76,7 @@ static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, if (!package) return NULL; package->mask = package_mask; + package->obj_type = OBJ_TYPE_PACKAGE; packages = g_list_append(packages, package); package_count++; } @@ -115,6 +116,7 @@ static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu, cache = calloc(sizeof(struct topo_obj), 1); if (!cache) return NULL; + cache->obj_type = OBJ_TYPE_CACHE; cache->mask = cache_mask; cache->number = cache_domain_count; cache_domains = g_list_append(cache_domains, cache); @@ -169,6 +171,8 @@ static void do_one_cpu(char *path) if (!cpu) return; + cpu->obj_type = OBJ_TYPE_CPU; + cpu->number = strtoul(&path[27], NULL, 10); cpu_set(cpu->number, cpu_possible_map); diff --git a/numa.c b/numa.c index d6b99d0..c5f8e54 100644 --- a/numa.c +++ b/numa.c @@ -40,6 +40,7 @@ GList *numa_nodes = NULL; struct topo_obj unspecified_node = { .load = 0, .number = -1, + .obj_type = OBJ_TYPE_NODE, .mask = CPU_MASK_ALL, .interrupts = NULL, .children = NULL, @@ -71,7 +72,7 @@ static void add_one_node(const char *nodename) free(cpustr); } } - + new->obj_type = OBJ_TYPE_NODE; new->number = 
strtoul(&nodename[4], NULL, 10); numa_nodes = g_list_append(numa_nodes, new); } diff --git a/types.h b/types.h index 85d99e6..3c4cd11 100644 --- a/types.h +++ b/types.h @@ -26,8 +26,16 @@ #define IRQ_TYPE_MSI 1 #define IRQ_TYPE_MSIX 2 +enum obj_type_e { + OBJ_TYPE_CPU, + OBJ_TYPE_CACHE, + OBJ_TYPE_PACKAGE, + OBJ_TYPE_NODE +}; + struct topo_obj { uint64_t load; + enum obj_type_e obj_type; int number; cpumask_t mask; GList *interrupts; From f06001f62f4a2200c379d32e34011ae8571286d0 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 11:47:37 -0400 Subject: [PATCH 34/44] add master list pointer to topo_obj Its convienient to know how many objects of a given type you have without having to know the specific object type. We can get this info with a pointer to a pointer in each topo object assigned to that objects type master list (cpus, cache_domains, packages, numa_nodes) when we build the tree --- cputree.c | 3 +++ numa.c | 2 ++ types.h | 1 + 3 files changed, 6 insertions(+) diff --git a/cputree.c b/cputree.c index d24e965..4a5672c 100644 --- a/cputree.c +++ b/cputree.c @@ -77,6 +77,7 @@ static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, return NULL; package->mask = package_mask; package->obj_type = OBJ_TYPE_PACKAGE; + package->obj_type_list = &packages; packages = g_list_append(packages, package); package_count++; } @@ -119,6 +120,7 @@ static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu, cache->obj_type = OBJ_TYPE_CACHE; cache->mask = cache_mask; cache->number = cache_domain_count; + cache->obj_type_list = &cache_domains; cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; } @@ -250,6 +252,7 @@ static void do_one_cpu(char *path) cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus); cpus_and(cpu->mask, cpu->mask, unbanned_cpus); + cpu->obj_type_list = &cpus; cpus = g_list_append(cpus, cpu); core_count++; } diff --git a/numa.c b/numa.c index c5f8e54..f6ef8c2 100644 --- 
a/numa.c +++ b/numa.c @@ -45,6 +45,7 @@ struct topo_obj unspecified_node = { .interrupts = NULL, .children = NULL, .parent = NULL, + .obj_type_list = &numa_nodes, }; static void add_one_node(const char *nodename) @@ -74,6 +75,7 @@ static void add_one_node(const char *nodename) } new->obj_type = OBJ_TYPE_NODE; new->number = strtoul(&nodename[4], NULL, 10); + new->obj_type_list = &numa_nodes; numa_nodes = g_list_append(numa_nodes, new); } diff --git a/types.h b/types.h index 3c4cd11..c26617f 100644 --- a/types.h +++ b/types.h @@ -41,6 +41,7 @@ struct topo_obj { GList *interrupts; struct topo_obj *parent; GList *children; + GList **obj_type_list; }; struct irq_info { From 58885160eef9235c727204704aab710a99f14e2f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 12:00:23 -0400 Subject: [PATCH 35/44] Consolidate for_each_* functions Each topology object used to have its own for_each iterator. Now that we have a common topology object, we can merge all that code --- cputree.c | 51 +++++++----------------------------------------- irqbalance.c | 2 +- irqbalance.h | 29 ++++++++++++++++++++++----- irqlist.c | 14 ++++++------- numa.c | 13 ------------ placement.c | 20 +++++++++---------- procinterrupts.c | 2 +- 7 files changed, 50 insertions(+), 81 deletions(-) diff --git a/cputree.c b/cputree.c index 4a5672c..10b68a8 100644 --- a/cputree.c +++ b/cputree.c @@ -279,7 +279,7 @@ static void dump_cache_domain(struct topo_obj *d, void *data) cpumask_scnprintf(buffer, 4095, d->mask); printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load); if (d->children) - for_each_cpu_core(d->children, dump_topo_obj, NULL); + for_each_object(d->children, dump_topo_obj, NULL); if (d->interrupts) for_each_irq(d->interrupts, dump_irq, (void *)10); } @@ -290,7 +290,7 @@ static void dump_package(struct topo_obj *d, void *data) cpumask_scnprintf(buffer, 4096, d->mask); printf("Package %i: 
numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load); if (d->children) - for_each_cache_domain(d->children, dump_cache_domain, buffer); + for_each_object(d->children, dump_cache_domain, buffer); if (d->interrupts) for_each_irq(d->interrupts, dump_irq, (void *)2); } @@ -298,7 +298,7 @@ static void dump_package(struct topo_obj *d, void *data) void dump_tree(void) { char buffer[4096]; - for_each_package(NULL, dump_package, buffer); + for_each_object(packages, dump_package, buffer); } static void clear_cpu_stats(struct topo_obj *d, void *data __attribute__((unused))) @@ -310,19 +310,19 @@ static void clear_cpu_stats(struct topo_obj *d, void *data __attribute__((unused static void clear_cd_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; - for_each_cpu_core(d->children, clear_cpu_stats, NULL); + for_each_object(d->children, clear_cpu_stats, NULL); } static void clear_package_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; - for_each_cache_domain(d->children, clear_cd_stats, NULL); + for_each_object(d->children, clear_cd_stats, NULL); } static void clear_node_stats(struct topo_obj *d, void *data __attribute__((unused))) { d->load = 0; - for_each_package(d->children, clear_package_stats, NULL); + for_each_object(d->children, clear_package_stats, NULL); } static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) @@ -337,7 +337,7 @@ static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unu */ void clear_work_stats(void) { - for_each_numa_node(NULL, clear_node_stats, NULL); + for_each_object(numa_nodes, clear_node_stats, NULL); for_each_irq(NULL, clear_irq_stats, NULL); } @@ -417,43 +417,6 @@ void clear_cpu_tree(void) } - -void for_each_package(GList *list, void (*cb)(struct topo_obj *p, void *data), void *data) -{ - GList *entry = g_list_first(list ? 
list : packages); - GList *next; - - while (entry) { - next = g_list_next(entry); - cb(entry->data, data); - entry = next; - } -} - -void for_each_cache_domain(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data) -{ - GList *entry = g_list_first(list ? list : cache_domains); - GList *next; - - while (entry) { - next = g_list_next(entry); - cb(entry->data, data); - entry = next; - } -} - -void for_each_cpu_core(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data) -{ - GList *entry = g_list_first(list ? list : cpus); - GList *next; - - while (entry) { - next = g_list_next(entry); - cb(entry->data, data); - entry = next; - } -} - static gint compare_cpus(gconstpointer a, gconstpointer b) { const struct topo_obj *ai = a; diff --git a/irqbalance.c b/irqbalance.c index 645adaa..072a554 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -135,7 +135,7 @@ static void free_object_tree() static void dump_object_tree() { - for_each_numa_node(NULL, dump_numa_node_info, NULL); + for_each_object(numa_nodes, dump_numa_node_info, NULL); } static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused))) diff --git a/irqbalance.h b/irqbalance.h index fbced39..acf7e0e 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -40,6 +40,15 @@ void clear_cpu_tree(void); void pci_numa_scan(void); /*===================NEW BALANCER FUNCTIONS============================*/ + +/* + * Master topo_obj type lists + */ +extern GList *numa_nodes; +extern GList *packages; +extern GList *cache_domains; +extern GList *cpus; + enum hp_e { HINT_POLICY_IGNORE, HINT_POLICY_SUBSET, @@ -58,7 +67,6 @@ extern enum hp_e hint_policy; extern void build_numa_node_list(void); extern void free_numa_node_list(void); extern void dump_numa_node_info(struct topo_obj *node, void *data); -extern void for_each_numa_node(GList *list, void (*cb)(struct topo_obj *node, void *data), void *data); extern void add_package_to_node(struct topo_obj *p, int nodeid); extern struct topo_obj 
*get_numa_node(int nodeid); @@ -66,14 +74,12 @@ extern struct topo_obj *get_numa_node(int nodeid); * Package functions */ #define package_numa_node(p) ((p)->parent) -extern void for_each_package(GList *list, void (*cb)(struct topo_obj *p, void *data), void *data); /* * cache_domain functions */ #define cache_domain_package(c) ((c)->parent) #define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) -extern void for_each_cache_domain(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data); /* * cpu core functions @@ -81,7 +87,6 @@ extern void for_each_cache_domain(GList *list, void (*cb)(struct topo_obj *c, vo #define cpu_cache_domain(cpu) ((cpu)->parent) #define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) #define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) -extern void for_each_cpu_core(GList *list, void (*cb)(struct topo_obj *c, void *data), void *data); extern struct topo_obj *find_cpu_core(int cpunr); extern int get_cpu_count(void); @@ -94,8 +99,22 @@ extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *d extern struct irq_info *get_irq_info(int irq); extern void migrate_irq(GList **from, GList **to, struct irq_info *info); extern struct irq_info *add_misc_irq(int irq); - #define irq_numa_node(irq) ((irq)->numa_node) + +/* + * Generic object functions + */ +static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data) +{ + GList *entry, *next; + entry = g_list_first(list); + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + #endif diff --git a/irqlist.c b/irqlist.c index 55e64b8..5a83f4b 100644 --- a/irqlist.c +++ b/irqlist.c @@ -131,23 +131,23 @@ static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) #define find_overloaded_objs(name, info) do {\ int ___load_sources;\ memset(&(info), 0, sizeof(struct load_balance_info));\ - 
for_each_##name(NULL, gather_load_stats, &(info));\ + for_each_object((name), gather_load_stats, &(info));\ (info).avg_load = (info).total_load / (info).load_sources;\ - for_each_##name(NULL, compute_deviations, &(info));\ + for_each_object((name), compute_deviations, &(info));\ ___load_sources = ((info).load_sources == 1) ? 1 : ((info).load_sources - 1);\ (info).std_deviation = (long double)((info).deviations / ___load_sources);\ (info).std_deviation = sqrt((info).std_deviation);\ - for_each_##name(NULL, migrate_overloaded_irqs, &(info));\ + for_each_object((name), migrate_overloaded_irqs, &(info));\ }while(0) void update_migration_status(void) { struct load_balance_info info; - find_overloaded_objs(cpu_core, info); - find_overloaded_objs(cache_domain, info); - find_overloaded_objs(package, info); - find_overloaded_objs(numa_node, info); + find_overloaded_objs(cpus, info); + find_overloaded_objs(cache_domains, info); + find_overloaded_objs(packages, info); + find_overloaded_objs(numa_nodes, info); } diff --git a/numa.c b/numa.c index f6ef8c2..e7f7079 100644 --- a/numa.c +++ b/numa.c @@ -144,19 +144,6 @@ void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused) printf("\n"); } -void for_each_numa_node(GList *list, void(*cb)(struct topo_obj *node, void *data), void *data) -{ - GList *entry, *next; - - entry = g_list_first(list ? list : numa_nodes); - - while (entry) { - next = g_list_next(entry); - cb(entry->data, data); - entry = next; - } -} - struct topo_obj *get_numa_node(int nodeid) { struct topo_obj find; diff --git a/placement.c b/placement.c index a2b537f..cdaf026 100644 --- a/placement.c +++ b/placement.c @@ -89,7 +89,7 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_cache_domain(p->children, find_best_object, &place); + for_each_object(p->children, find_best_object, &place); asign = place.least_irqs ? 
place.least_irqs : place.best; @@ -124,7 +124,7 @@ static void place_core(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_cpu_core(c->children, find_best_object, &place); + for_each_object(c->children, find_best_object, &place); asign = place.least_irqs ? place.least_irqs : place.best; @@ -159,7 +159,7 @@ static void place_irq_in_package(struct irq_info *info, void *data) place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_package(n->children, find_best_object, &place); + for_each_object(n->children, find_best_object, &place); asign = place.least_irqs ? place.least_irqs : place.best; @@ -200,7 +200,7 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u place.least_irqs = NULL; place.info = info; - for_each_numa_node(NULL, find_best_object, &place); + for_each_object(numa_nodes, find_best_object, &place); asign = place.least_irqs ? place.least_irqs : place.best; @@ -226,9 +226,9 @@ static void validate_object(struct topo_obj *d, void *data __attribute__((unused static void validate_object_tree_placement() { - for_each_package(NULL, validate_object, NULL); - for_each_cache_domain(NULL, validate_object, NULL); - for_each_cpu_core(NULL, validate_object, NULL); + for_each_object(packages, validate_object, NULL); + for_each_object(cache_domains, validate_object, NULL); + for_each_object(cpus, validate_object, NULL); } void calculate_placement(void) @@ -239,9 +239,9 @@ void calculate_placement(void) sort_irq_list(&rebalance_irq_list); for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); - for_each_numa_node(NULL, place_packages, NULL); - for_each_package(NULL, place_cache_domain, NULL); - for_each_cache_domain(NULL, place_cores, NULL); + for_each_object(numa_nodes, place_packages, NULL); + for_each_object(packages, place_cache_domain, NULL); + for_each_object(cache_domains, place_cores, NULL); if (debug_mode) validate_object_tree_placement(); diff --git a/procinterrupts.c 
b/procinterrupts.c index 3b6e136..1f24a72 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -204,6 +204,6 @@ void parse_proc_stat() * Now that we have load for each cpu attribute a fair share of the load * to each irq on that cpu */ - for_each_cpu_core(NULL, compute_irq_load_share, NULL); + for_each_object(cpus, compute_irq_load_share, NULL); } From 4fd799cbd17ea893d06dcb9aa0db58e75afc2495 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 13:21:00 -0400 Subject: [PATCH 36/44] Consolidate duplicated functions We had lots of duplicated functionality with multiple topology types. Now that we have only the single topo_obj structure, we can consolidate lots of code --- cputree.c | 34 ++++------------- placement.c | 108 ++++++++++++++-------------------------------------- 2 files changed, 36 insertions(+), 106 deletions(-) diff --git a/cputree.c b/cputree.c index 10b68a8..79de87c 100644 --- a/cputree.c +++ b/cputree.c @@ -301,35 +301,18 @@ void dump_tree(void) for_each_object(packages, dump_package, buffer); } -static void clear_cpu_stats(struct topo_obj *d, void *data __attribute__((unused))) -{ - struct topo_obj *c = (struct topo_obj *)d; - c->load = 0; -} - -static void clear_cd_stats(struct topo_obj *d, void *data __attribute__((unused))) -{ - d->load = 0; - for_each_object(d->children, clear_cpu_stats, NULL); -} - -static void clear_package_stats(struct topo_obj *d, void *data __attribute__((unused))) -{ - d->load = 0; - for_each_object(d->children, clear_cd_stats, NULL); -} - -static void clear_node_stats(struct topo_obj *d, void *data __attribute__((unused))) -{ - d->load = 0; - for_each_object(d->children, clear_package_stats, NULL); -} - static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) { info->load = 0; } +static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused))) +{ + d->load = 0; + for_each_object(d->children, clear_obj_stats, NULL); + for_each_irq(d->interrupts, 
clear_irq_stats, NULL); +} + /* * this function removes previous state from the cpu tree, such as * which level does how much work and the actual lists of interrupts @@ -337,8 +320,7 @@ static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unu */ void clear_work_stats(void) { - for_each_object(numa_nodes, clear_node_stats, NULL); - for_each_irq(NULL, clear_irq_stats, NULL); + for_each_object(numa_nodes, clear_obj_stats, NULL); } diff --git a/placement.c b/placement.c index cdaf026..6071601 100644 --- a/placement.c +++ b/placement.c @@ -71,109 +71,57 @@ static void find_best_object(struct topo_obj *d, void *data) } } -static void place_irq_in_cache_domain(struct irq_info *info, void *data) +static void find_best_object_for_irq(struct irq_info *info, void *data) { - struct topo_obj *p = data; struct obj_placement place; + struct topo_obj *d = data; struct topo_obj *asign; if (!info->moved) return; - if (info->level <= BALANCE_PACKAGE) - return; + switch (d->obj_type) { + case OBJ_TYPE_NODE: + if (info->level == BALANCE_NONE) + return; + break; + case OBJ_TYPE_PACKAGE: + if (info->level == BALANCE_PACKAGE) + return; + break; - place.info = info; - place.best = NULL; - place.least_irqs = NULL; - place.best_cost = INT_MAX; + case OBJ_TYPE_CACHE: + if (info->level == BALANCE_CACHE) + return; + break; - for_each_object(p->children, find_best_object, &place); - - asign = place.least_irqs ? 
place.least_irqs : place.best; - - if (asign) { - migrate_irq(&p->interrupts, &asign->interrupts, info); - info->assigned_obj = asign; + case OBJ_TYPE_CPU: + if (info->level == BALANCE_CORE) + return; + break; } -} - -static void place_cache_domain(struct topo_obj *d, void *data __attribute__((unused))) -{ - if (d->interrupts) - for_each_irq(d->interrupts, place_irq_in_cache_domain, d); -} - -static void place_core(struct irq_info *info, void *data) -{ - struct topo_obj *c = data; - struct obj_placement place; - struct topo_obj *asign; - - if (!info->moved) - return; - - if ((info->level <= BALANCE_CACHE) && - (!one_shot_mode)) - return; - place.info = info; place.best = NULL; place.least_irqs = NULL; place.best_cost = INT_MAX; - for_each_object(c->children, find_best_object, &place); + for_each_object(d->children, find_best_object, &place); asign = place.least_irqs ? place.least_irqs : place.best; if (asign) { - migrate_irq(&c->interrupts, &asign->interrupts, info); - info->assigned_obj = asign; - asign->load += info->load; - } - -} - -static void place_cores(struct topo_obj *d, void *data __attribute__((unused))) -{ - if (d->interrupts) - for_each_irq(d->interrupts, place_core, d); -} - -static void place_irq_in_package(struct irq_info *info, void *data) -{ - struct obj_placement place; - struct topo_obj *n = data; - struct topo_obj *asign; - - if (!info->moved) - return; - - if (info->level == BALANCE_NONE) - return; - - place.info = info; - place.best = NULL; - place.least_irqs = NULL; - place.best_cost = INT_MAX; - - for_each_object(n->children, find_best_object, &place); - - asign = place.least_irqs ? 
place.least_irqs : place.best; - - if (asign) { - migrate_irq(&n->interrupts, &asign->interrupts, info); + migrate_irq(&d->interrupts, &asign->interrupts, info); info->assigned_obj = asign; asign->load += info->load; } } -static void place_packages(struct topo_obj *d, void *data __attribute__((unused))) +static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused))) { - if (d->interrupts) - for_each_irq(d->interrupts, place_irq_in_package, d); + if (g_list_length(d->interrupts) > 0) + for_each_irq(d->interrupts, find_best_object_for_irq, d); } static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) @@ -190,7 +138,7 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u * put it on that node */ migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info); - info->assigned_obj = (struct topo_obj *)irq_numa_node(info); + info->assigned_obj = irq_numa_node(info); irq_numa_node(info)->load += info->load + 1; return; } @@ -239,9 +187,9 @@ void calculate_placement(void) sort_irq_list(&rebalance_irq_list); for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); - for_each_object(numa_nodes, place_packages, NULL); - for_each_object(packages, place_cache_domain, NULL); - for_each_object(cache_domains, place_cores, NULL); + for_each_object(numa_nodes, place_irq_in_object, NULL); + for_each_object(packages, place_irq_in_object, NULL); + for_each_object(cache_domains, place_irq_in_object, NULL); if (debug_mode) validate_object_tree_placement(); From 38b3bb825d7644280beaed78a6e1b4c414f65492 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 15:04:26 -0400 Subject: [PATCH 37/44] Adjusting load computation to be fair at all levels Previuosly we treated the all load as originating from a single cpu. Thats not true in the event an interrupt is assigned to a higher level object, like a cache domain. 
This change spreads the load out, attempting to use a hureistic whereby we assume that a fair share of interrupts from parent objects is handled by this cpu, so we only attribute a fraction of the total shared load to each single cpu. This allows us to come up with a reasonable load value for interrupts assigned to cache domains, packages, nodes, etc Signed-off-by: Neil Horman --- classify.c | 11 ++++------- irqbalance.c | 7 +++++-- placement.c | 15 ++++++--------- procinterrupts.c | 45 +++++++++++++++++++++++++++++++++++++-------- 4 files changed, 52 insertions(+), 26 deletions(-) diff --git a/classify.c b/classify.c index 49212d4..6a20480 100644 --- a/classify.c +++ b/classify.c @@ -313,13 +313,10 @@ void migrate_irq(GList **from, GList **to, struct irq_info *info) GList *entry; struct irq_info find, *tmp;; - if (from != NULL) { - find.irq = info->irq; - entry = g_list_find_custom(*from, &find, compare_ints); - tmp = entry->data; - *from = g_list_delete_link(*from, entry); - } else - tmp = info; + find.irq = info->irq; + entry = g_list_find_custom(*from, &find, compare_ints); + tmp = entry->data; + *from = g_list_delete_link(*from, entry); *to = g_list_append(*to, tmp); diff --git a/irqbalance.c b/irqbalance.c index 072a554..c229b9d 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -143,8 +143,11 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__( if (info->level == BALANCE_NONE) return; - migrate_irq((info->assigned_obj ? 
&info->assigned_obj->interrupts : NULL), - &rebalance_irq_list, info); + if (info->assigned_obj == NULL) + rebalance_irq_list = g_list_append(rebalance_irq_list, info); + else + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + info->assigned_obj = NULL; } diff --git a/placement.c b/placement.c index 6071601..0e0a998 100644 --- a/placement.c +++ b/placement.c @@ -181,16 +181,13 @@ static void validate_object_tree_placement() void calculate_placement(void) { - /* first clear old data */ - clear_work_stats(); - sort_irq_list(&rebalance_irq_list); - - for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); - for_each_object(numa_nodes, place_irq_in_object, NULL); - for_each_object(packages, place_irq_in_object, NULL); - for_each_object(cache_domains, place_irq_in_object, NULL); - + if (g_list_length(rebalance_irq_list) > 0) { + for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); + for_each_object(numa_nodes, place_irq_in_object, NULL); + for_each_object(packages, place_irq_in_object, NULL); + for_each_object(cache_domains, place_irq_in_object, NULL); + } if (debug_mode) validate_object_tree_placement(); } diff --git a/procinterrupts.c b/procinterrupts.c index 1f24a72..d5555f2 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -128,16 +128,45 @@ static void assign_load_slice(struct irq_info *info, void *data) info->load = (info->irq_count - info->last_irq_count) * *load_slice; } -static void compute_irq_load_share(struct topo_obj *d, void *data __attribute__((unused))) +/* + * Recursive helper to estimate the number of irqs shared between + * multiple topology objects that was handled by this particular object + */ +static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d) +{ + uint64_t total_irq_count = 0; + + if (d->parent) { + total_irq_count = get_parent_branch_irq_count_share(d->parent); + total_irq_count /= g_list_length(*d->obj_type_list); + } + + if (g_list_length(d->interrupts) > 0) + 
for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_count); + + return total_irq_count; +} + +static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused))) { uint64_t total_irq_counts = 0; + uint64_t local_irq_counts = 0; + uint64_t load_slice; - for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_counts); + total_irq_counts = get_parent_branch_irq_count_share(d); - load_slice = total_irq_counts ? (d->load / total_irq_counts) : 1; + load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1; - for_each_irq(d->interrupts, assign_load_slice, &load_slice); + if (g_list_length(d->interrupts) > 0) { + for_each_irq(d->interrupts, accumulate_irq_count, &local_irq_counts); + for_each_irq(d->interrupts, assign_load_slice, &load_slice); + } + + if (d->parent) { + load_slice = total_irq_counts ? (d->load / total_irq_counts) : 1; + d->parent->load += (total_irq_counts - local_irq_counts) * load_slice; + } } void parse_proc_stat() @@ -189,9 +218,6 @@ void parse_proc_stat() * all the way up the device tree */ cpu->load = irq_load + softirq_load; - cpu_cache_domain(cpu)->load += cpu->load; - cpu_package(cpu)->load += cpu->load; - cpu_numa_node(cpu)->load += cpu->load; } fclose(file); @@ -204,6 +230,9 @@ void parse_proc_stat() * Now that we have load for each cpu attribute a fair share of the load * to each irq on that cpu */ - for_each_object(cpus, compute_irq_load_share, NULL); + for_each_object(cpus, compute_irq_branch_load_share, NULL); + for_each_object(cache_domains, compute_irq_branch_load_share, NULL); + for_each_object(packages, compute_irq_branch_load_share, NULL); + for_each_object(numa_nodes, compute_irq_branch_load_share, NULL); } From e2f6588bd41a3738cf7e2800b2cdfbb02289b44f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 15:59:15 -0400 Subject: [PATCH 38/44] Add powersave settings Add an optional heuristic to allow cpus to not service interrupts during periods of low activity, to help 
power conservation. If more than power_thresh cpus are more then a standard deviation below the average load, and no cpus are overloaded by more than a standard deviation and have more than one irq on them, then we stop balancing to a single cpu. If at any time we have a cpu go over a standard deviation of load, we re-enable all the cpus for balancing --- irqbalance.c | 32 ++++++++++++++++++++------------ irqbalance.h | 2 ++ irqlist.c | 34 ++++++++++++++++++++++++++++++++-- placement.c | 3 +++ powermode.c | 47 ----------------------------------------------- types.h | 1 + 6 files changed, 58 insertions(+), 61 deletions(-) diff --git a/irqbalance.c b/irqbalance.c index c229b9d..c87953c 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -40,9 +40,9 @@ int debug_mode; int numa_avail; int need_cpu_rescan; extern cpumask_t banned_cpus; -static int counter; enum hp_e hint_policy = HINT_POLICY_SUBSET; - +unsigned long power_thresh = ULONG_MAX; +unsigned long long cycle_count = 0; void sleep_approx(int seconds) { @@ -63,12 +63,14 @@ struct option lopts[] = { {"oneshot", 0, NULL, 'o'}, {"debug", 0, NULL, 'd'}, {"hintpolicy", 1, NULL, 'h'}, + {"powerthresh", 1, NULL, 'p'}, {0, 0, 0, 0} }; static void usage(void) { - printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]"); + printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]\n"); + printf(" [--powerthresh= | -p | ]\n"); } static void parse_command_line(int argc, char **argv) @@ -77,7 +79,7 @@ static void parse_command_line(int argc, char **argv) int longind; while ((opt = getopt_long(argc, argv, - "odh:", + "odh:p:", lopts, &longind)) != -1) { switch(opt) { @@ -99,6 +101,17 @@ static void parse_command_line(int argc, char **argv) exit(1); } break; + case 'p': + if (!strncmp(optarg, "off", strlen(optarg))) + power_thresh = ULONG_MAX; + else { + power_thresh = strtoull(optarg, NULL, 10); + if (power_thresh == ULONG_MAX) { + usage(); + exit(1); + } + 
} + break; case 'o': one_shot_mode=1; break; @@ -153,7 +166,6 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__( int main(int argc, char** argv) { - int compute_migration_status=0; #ifdef HAVE_GETOPT_LONG parse_command_line(argc, argv); @@ -214,7 +226,6 @@ int main(int argc, char** argv) printf("\n\n\n-----------------------------------------------------------------------------\n"); - check_power_mode(); parse_proc_interrupts(); parse_proc_stat(); @@ -231,14 +242,11 @@ int main(int argc, char** argv) free_object_tree(); build_object_tree(); for_each_irq(NULL, force_rebalance_irq, NULL); - compute_migration_status=0; + cycle_count=0; } - if (compute_migration_status) + if (cycle_count) update_migration_status(); - else - compute_migration_status=1; - calculate_placement(); activate_mappings(); @@ -248,7 +256,7 @@ int main(int argc, char** argv) if (one_shot_mode) break; clear_work_stats(); - counter++; + cycle_count++; } free_object_tree(); diff --git a/irqbalance.h b/irqbalance.h index acf7e0e..8849d0e 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -60,6 +60,8 @@ extern int one_shot_mode; extern int power_mode; extern int need_cpu_rescan; extern enum hp_e hint_policy; +extern unsigned long long cycle_count; +extern unsigned long power_thresh; /* * Numa node access routines diff --git a/irqlist.c b/irqlist.c index 5a83f4b..d57049a 100644 --- a/irqlist.c +++ b/irqlist.c @@ -42,6 +42,10 @@ struct load_balance_info { int load_sources; unsigned long long int deviations; long double std_deviation; + unsigned int num_within; + unsigned int num_over; + unsigned int num_under; + struct topo_obj *powersave; }; static void gather_load_stats(struct topo_obj *obj, void *data) @@ -102,13 +106,21 @@ static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) /* * Don't rebalance irqs on objects whos load is below the average */ - if (obj->load <= info->avg_load) + if (obj->load <= info->avg_load) { + if ((obj->load + 
info->std_deviation) <= info->avg_load) { + info->num_under++; + info->powersave = obj; + } else + info->num_within++; return; + } deviation = obj->load - info->avg_load; if ((deviation > info->std_deviation) && (g_list_length(obj->interrupts) > 1)) { + + info->num_over++; /* * We have a cpu that is overloaded and * has irqs that can be moved to fix that @@ -124,10 +136,21 @@ static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) * difference reaches zero */ for_each_irq(obj->interrupts, move_candidate_irqs, &deviation); - } + } else + info->num_within++; } +static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused))) +{ + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); +} + +static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused))) +{ + obj->powersave_mode = 0; +} + #define find_overloaded_objs(name, info) do {\ int ___load_sources;\ memset(&(info), 0, sizeof(struct load_balance_info));\ @@ -145,6 +168,13 @@ void update_migration_status(void) struct load_balance_info info; find_overloaded_objs(cpus, info); + if (cycle_count > 5) { + if (!info.num_over && (info.num_under >= power_thresh)) { + info.powersave->powersave_mode = 1; + for_each_irq(info.powersave->interrupts, force_irq_migration, NULL); + } else if (info.num_over) + for_each_object(cpus, clear_powersave_mode, NULL); + } find_overloaded_objs(cache_domains, info); find_overloaded_objs(packages, info); find_overloaded_objs(numa_nodes, info); diff --git a/placement.c b/placement.c index 0e0a998..cfa419e 100644 --- a/placement.c +++ b/placement.c @@ -58,6 +58,9 @@ static void find_best_object(struct topo_obj *d, void *data) } } + if (d->powersave_mode) + return; + newload = d->load; if (newload < best->best_cost) { best->best = d; diff --git a/powermode.c b/powermode.c index 809cae8..82ba490 100644 --- a/powermode.c +++ b/powermode.c @@ -28,54 +28,7 @@ #include "irqbalance.h" -extern int power_mode; - 
-static uint64_t previous; - -static unsigned int hysteresis; - void check_power_mode(void) { - FILE *file; - char *line = NULL; - size_t size = 0; - char *c; - uint64_t dummy __attribute__((unused)); - uint64_t irq, softirq; - file = fopen("/proc/stat", "r"); - if (!file) - return; - if (getline(&line, &size, file)==0) - size=0; - fclose(file); - if (!line) - return; - c=&line[4]; - dummy = strtoull(c, &c, 10); /* user */ - dummy = strtoull(c, &c, 10); /* nice */ - dummy = strtoull(c, &c, 10); /* system */ - dummy = strtoull(c, &c, 10); /* idle */ - dummy = strtoull(c, &c, 10); /* iowait */ - irq = strtoull(c, &c, 10); /* irq */ - softirq = strtoull(c, &c, 10); /* softirq */ - - - irq += softirq; - printf("IRQ delta is %lu \n", (unsigned long)(irq - previous) ); - if (irq - previous < POWER_MODE_SOFTIRQ_THRESHOLD) { - hysteresis++; - if (hysteresis > POWER_MODE_HYSTERESIS) { - if (debug_mode && !power_mode) - printf("IRQ delta is %lu, switching to power mode \n", (unsigned long)(irq - previous) ); - power_mode = 1; - } - } else { - if (debug_mode && power_mode) - printf("IRQ delta is %lu, switching to performance mode \n", (unsigned long)(irq - previous) ); - power_mode = 0; - hysteresis = 0; - } - previous = irq; - free(line); } diff --git a/types.h b/types.h index c26617f..1fd2be9 100644 --- a/types.h +++ b/types.h @@ -37,6 +37,7 @@ struct topo_obj { uint64_t load; enum obj_type_e obj_type; int number; + int powersave_mode; cpumask_t mask; GList *interrupts; struct topo_obj *parent; From 327691a79b322c849b5cdcdeec30bbea569646c5 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 10 Oct 2011 16:05:16 -0400 Subject: [PATCH 39/44] Adding Syslog notification for powersave add some syslog messages to indicate when powersaving is taking place --- irqbalance.h | 1 + irqlist.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/irqbalance.h b/irqbalance.h index 8849d0e..a1b1e8a 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -8,6 +8,7 @@ #include 
#include +#include #include "types.h" #include diff --git a/irqlist.c b/irqlist.c index d57049a..fba2696 100644 --- a/irqlist.c +++ b/irqlist.c @@ -170,10 +170,13 @@ void update_migration_status(void) find_overloaded_objs(cpus, info); if (cycle_count > 5) { if (!info.num_over && (info.num_under >= power_thresh)) { + syslog(LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number); info.powersave->powersave_mode = 1; for_each_irq(info.powersave->interrupts, force_irq_migration, NULL); - } else if (info.num_over) + } else if (info.num_over) { + syslog(LOG_INFO, "Load average increasing, re-enabling all cpus for irq balancing\n"); for_each_object(cpus, clear_powersave_mode, NULL); + } } find_overloaded_objs(cache_domains, info); find_overloaded_objs(packages, info); From c4b8403e2346e5fc67508f223c943c2dad074c41 Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Tue, 11 Oct 2011 11:17:51 -0400 Subject: [PATCH 40/44] proc/interrupts needs more permissive parsing When there was more than one leading space at the beginning of a line, the irq database was forced to rebuild. Signed-off-by: Petr Holasek Signed-off-by: Neil Horman --- procinterrupts.c | 6 +++++- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/procinterrupts.c b/procinterrupts.c index d5555f2..2c1bb25 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "cpumask.h" #include "irqbalance.h" @@ -66,7 +67,10 @@ void parse_proc_interrupts(void) proc_int_has_msi = 1; /* lines with letters in front are special, like NMI count. 
Ignore */ - if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9'))) + c = line; + while (isblank(*(c++))) + ; + if (!(*c>='0' && *c<='9')) break; c = strchr(line, ':'); if (!c) -- 1.7.6.4 --- procinterrupts.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/procinterrupts.c b/procinterrupts.c index d5555f2..2c1bb25 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "cpumask.h" #include "irqbalance.h" @@ -66,7 +67,10 @@ void parse_proc_interrupts(void) proc_int_has_msi = 1; /* lines with letters in front are special, like NMI count. Ignore */ - if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9'))) + c = line; + while (isblank(*(c++))) + ; + if (!(*c>='0' && *c<='9')) break; c = strchr(line, ':'); if (!c) From cf4a1d8d4d122c347cf5e7d7e7178451b052d694 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 11 Oct 2011 11:26:58 -0400 Subject: [PATCH 41/44] Remove extraneous header file checks ethtool.h and sockios.h are no longer needed after the re-work Signed-off-by: Neil Horman Suggested-by: Petr Holasek --- configure.ac | 1 - 1 file changed, 1 deletion(-) diff --git a/configure.ac b/configure.ac index fd0385f..b490cd0 100644 --- a/configure.ac +++ b/configure.ac @@ -51,7 +51,6 @@ AC_PROG_AWK echo . 
echo Checking for header files AC_HEADER_STDC -AC_CHECK_HEADERS(linux/ethtool.h linux/sockios.h, [], []) AC_CHECK_FUNCS(getopt_long) From 5ebd1b24a934767fef62d4cfa643b1191c699a23 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 11 Oct 2011 11:54:48 -0400 Subject: [PATCH 42/44] fix up iterator in parse_proc_interrupts There was a problem with the iterator loop to skip leading whitespace in parse_proc_interrupts - it skipped too far, this brings it back --- procinterrupts.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/procinterrupts.c b/procinterrupts.c index 2c1bb25..ee061dc 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -68,8 +68,9 @@ void parse_proc_interrupts(void) /* lines with letters in front are special, like NMI count. Ignore */ c = line; - while (isblank(*(c++))) - ; + while (isblank(*(c))) + c++; + if (!(*c>='0' && *c<='9')) break; c = strchr(line, ':'); From 34ac21b1a71d5dcaa3c014d5dc009d610cba352e Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 11 Oct 2011 14:19:21 -0400 Subject: [PATCH 43/44] Fix up load estimator Found a few bugs in the load estimator - we were attributing load to multiple irqs erroneously, and in the process of fixing that I found that we had a bogus topology map - the same package was getting added multiple times to a given numa node, since we didn't detect that it was already part of that node's child list. 
--- cputree.c | 3 +-- irqbalance.c | 11 ++++++++++- numa.c | 6 ++++-- procinterrupts.c | 17 +++++++---------- types.h | 1 + 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/cputree.c b/cputree.c index 79de87c..cc6e077 100644 --- a/cputree.c +++ b/cputree.c @@ -308,7 +308,6 @@ static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unu static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused))) { - d->load = 0; for_each_object(d->children, clear_obj_stats, NULL); for_each_irq(d->interrupts, clear_irq_stats, NULL); } @@ -318,7 +317,7 @@ static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused * which level does how much work and the actual lists of interrupts * assigned to each component */ -void clear_work_stats(void) +void clear_work_stats() { for_each_object(numa_nodes, clear_obj_stats, NULL); } diff --git a/irqbalance.c b/irqbalance.c index c87953c..5bab859 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -220,12 +220,16 @@ int main(int argc, char** argv) for_each_irq(NULL, force_rebalance_irq, NULL); + parse_proc_interrupts(); + parse_proc_stat(); + while (1) { sleep_approx(SLEEP_INTERVAL); if (debug_mode) printf("\n\n\n-----------------------------------------------------------------------------\n"); + clear_work_stats(); parse_proc_interrupts(); parse_proc_stat(); @@ -242,6 +246,12 @@ int main(int argc, char** argv) free_object_tree(); build_object_tree(); for_each_irq(NULL, force_rebalance_irq, NULL); + parse_proc_interrupts(); + parse_proc_stat(); + sleep_approx(SLEEP_INTERVAL); + clear_work_stats(); + parse_proc_interrupts(); + parse_proc_stat(); cycle_count=0; } @@ -255,7 +265,6 @@ int main(int argc, char** argv) dump_tree(); if (one_shot_mode) break; - clear_work_stats(); cycle_count++; } diff --git a/numa.c b/numa.c index e7f7079..6e00243 100644 --- a/numa.c +++ b/numa.c @@ -130,8 +130,10 @@ void add_package_to_node(struct topo_obj *p, int nodeid) node = entry->data; - 
node->children = g_list_append(node->children, p); - p->parent = node; + if (!p->parent) { + node->children = g_list_append(node->children, p); + p->parent = node; + } } void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused))) diff --git a/procinterrupts.c b/procinterrupts.c index ee061dc..cbcb1d1 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -154,24 +154,19 @@ static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d) static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused))) { - uint64_t total_irq_counts = 0; uint64_t local_irq_counts = 0; uint64_t load_slice; - total_irq_counts = get_parent_branch_irq_count_share(d); - - load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1; if (g_list_length(d->interrupts) > 0) { - for_each_irq(d->interrupts, accumulate_irq_count, &local_irq_counts); + local_irq_counts = get_parent_branch_irq_count_share(d); + load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1; for_each_irq(d->interrupts, assign_load_slice, &load_slice); } - if (d->parent) { - load_slice = total_irq_counts ? 
(d->load / total_irq_counts) : 1; - d->parent->load += (total_irq_counts - local_irq_counts) * load_slice; - } + if (d->parent) + d->parent->load += d->load; } void parse_proc_stat() @@ -222,7 +217,9 @@ void parse_proc_stat() * For each cpu add the irq and softirq load and propagate that * all the way up the device tree */ - cpu->load = irq_load + softirq_load; + if (cycle_count) + cpu->load = (irq_load + softirq_load) - (cpu->last_load); + cpu->last_load = (irq_load + softirq_load); } fclose(file); diff --git a/types.h b/types.h index 1fd2be9..e9a2fcd 100644 --- a/types.h +++ b/types.h @@ -35,6 +35,7 @@ enum obj_type_e { struct topo_obj { uint64_t load; + uint64_t last_load; enum obj_type_e obj_type; int number; int powersave_mode; From ac6240a8bc7e57c857c1f53c9af459b9a5b90090 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 11 Oct 2011 16:06:06 -0400 Subject: [PATCH 44/44] Fine tune load computation algorithm CPU stats display in terms of jiffies. Multiply that by 1000 to get us to units of 1/10th milliseconds so that large interrupt sources don't wind up reducing their load share to 0. 
Also, ensure that all interrupt sources have a load of at least one (no one is a free ride) --- irqlist.c | 3 +++ procinterrupts.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/irqlist.c b/irqlist.c index fba2696..c82131a 100644 --- a/irqlist.c +++ b/irqlist.c @@ -93,6 +93,9 @@ static void move_candidate_irqs(struct irq_info *info, void *data) *remaining_deviation -= info->load; + if (debug_mode) + printf("Selecting irq %d for rebalancing\n", info->irq); + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); info->assigned_obj = NULL; diff --git a/procinterrupts.c b/procinterrupts.c index cbcb1d1..ebaea5e 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -131,6 +131,12 @@ static void assign_load_slice(struct irq_info *info, void *data) { uint64_t *load_slice = data; info->load = (info->irq_count - info->last_irq_count) * *load_slice; + + /* + * Every IRQ has at least a load of 1 + */ + if (!info->load) + info->load++; } /* @@ -155,9 +161,10 @@ static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d) static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused))) { uint64_t local_irq_counts = 0; - uint64_t load_slice; + int load_divisor = g_list_length(d->children); + d->load /= (load_divisor ? load_divisor : 1); if (g_list_length(d->interrupts) > 0) { local_irq_counts = get_parent_branch_irq_count_share(d); @@ -217,8 +224,16 @@ void parse_proc_stat() * For each cpu add the irq and softirq load and propagate that * all the way up the device tree */ - if (cycle_count) + if (cycle_count) { cpu->load = (irq_load + softirq_load) - (cpu->last_load); + /* + * the [soft]irq_load values are in jiffies, which are + * units of 10ms, multiply by 1000 to convert that to + * 1/10 milliseconds. This give us a better integer + * distribution of load between irqs + */ + cpu->load *= 1000; + } cpu->last_load = (irq_load + softirq_load); }