diff --git a/Makefile.am b/Makefile.am index 5624af4..3d37155 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4 INCLUDES = -I${top_srcdir} -LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma +LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma -lm AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \ types.h diff --git a/activate.c b/activate.c index 68c142c..292c44a 100644 --- a/activate.c +++ b/activate.c @@ -32,30 +32,40 @@ #include "irqbalance.h" -void activate_mapping(void) +static void activate_mapping(struct irq_info *info, void *data __attribute__((unused))) { - struct interrupt *irq; - GList *iter; + char buf[PATH_MAX]; + FILE *file; + cpumask_t applied_mask; - iter = g_list_first(interrupts); - while (iter) { - irq = iter->data; - iter = g_list_next(iter); + /* + * only activate mappings for irqs that have moved + */ + if (!info->moved) + return; - /* don't set the level if it's a NONE irq, or if there is - * no change */ - if (irq->balance_level != BALANCE_NONE && - !cpus_equal(irq->mask, irq->old_mask)) { - char buf[PATH_MAX]; - FILE *file; - sprintf(buf, "/proc/irq/%i/smp_affinity", irq->number); - file = fopen(buf, "w"); - if (!file) - continue; - cpumask_scnprintf(buf, PATH_MAX, irq->mask); - fprintf(file,"%s", buf); - fclose(file); - irq->old_mask = irq->mask; - } - } + if (!info->assigned_obj) + return; + + + sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); + file = fopen(buf, "w"); + if (!file) + return; + + if ((hint_policy == HINT_POLICY_EXACT) && + (!cpus_empty(info->affinity_hint))) + applied_mask = info->affinity_hint; + else + applied_mask = info->assigned_obj->mask; + + cpumask_scnprintf(buf, PATH_MAX, applied_mask); + fprintf(file, "%s", buf); + fclose(file); + info->moved = 0; /*migration is done*/ +} + +void activate_mappings(void) +{ + 
for_each_irq(NULL, activate_mapping, NULL); } diff --git a/classify.c b/classify.c index 957dd34..6a20480 100644 --- a/classify.c +++ b/classify.c @@ -25,8 +25,6 @@ int map_class_to_level[7] = { BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CACHE, BALANCE_NONE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE }; -int class_counts[7]; - #define MAX_CLASS 0x12 /* * Class codes lifted from pci spec, appendix D. @@ -56,35 +54,6 @@ static short class_codes[MAX_CLASS] = { static GList *interrupts_db; #define SYSDEV_DIR "/sys/bus/pci/devices" -union property { - int int_val; - cpumask_t mask_val; -}; - -enum irq_type { - INT_TYPE = 0, - CPUMASK_TYPE, -}; - -struct irq_property { - enum irq_type itype; - union property iproperty; -}; -#define iint_val iproperty.int_val -#define imask_val iproperty.mask_val - -struct irq_info { - int irq; - struct irq_property property[IRQ_MAX_PROPERTY]; -}; - -static void init_new_irq(struct irq_info *new) -{ - new->property[IRQ_CLASS].itype = INT_TYPE; - new->property[IRQ_TYPE].itype = INT_TYPE; - new->property[IRQ_NUMA].itype = INT_TYPE; - new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE; -} static gint compare_ints(gconstpointer a, gconstpointer b) { @@ -94,11 +63,6 @@ static gint compare_ints(gconstpointer a, gconstpointer b) return ai->irq - bi->irq; } -static void free_int(gpointer data) -{ - free(data); -} - /* * Inserts an irq_info struct into the intterupts_db list * devpath points to the device directory in sysfs for the @@ -126,13 +90,12 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq) return NULL; } - new = malloc(sizeof(struct irq_info)); + new = calloc(sizeof(struct irq_info), 1); if (!new) return NULL; - init_new_irq(new); new->irq = irq; - new->property[IRQ_CLASS].iint_val = IRQ_OTHER; + new->class = IRQ_OTHER; interrupts_db = g_list_append(interrupts_db, new); @@ -159,7 +122,9 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq) if (class >= MAX_CLASS) goto get_numa_node; - 
new->property[IRQ_CLASS].iint_val = class_codes[class]; + new->class = class_codes[class]; + new->level = map_class_to_level[class_codes[class]]; + get_numa_node: numa_node = -1; sprintf(path, "%s/numa_node", devpath); @@ -171,24 +136,39 @@ get_numa_node: fclose(fd); assign_node: - new->property[IRQ_NUMA].iint_val = numa_node; + new->numa_node = get_numa_node(numa_node); sprintf(path, "%s/local_cpus", devpath); fd = fopen(path, "r"); if (!fd) { - cpus_setall(new->property[IRQ_LCPU_MASK].imask_val); - goto out; + cpus_setall(new->cpumask); + goto assign_affinity_hint; } lcpu_mask = NULL; rc = fscanf(fd, "%as", &lcpu_mask); fclose(fd); - if (!lcpu_mask) { - cpus_setall(new->property[IRQ_LCPU_MASK].imask_val); + if (!lcpu_mask || !rc) { + cpus_setall(new->cpumask); } else { cpumask_parse_user(lcpu_mask, strlen(lcpu_mask), - new->property[IRQ_LCPU_MASK].imask_val); - free(lcpu_mask); + new->cpumask); } + free(lcpu_mask); + +assign_affinity_hint: + cpus_clear(new->affinity_hint); + sprintf(path, "/proc/irq/%d/affinity_hint", irq); + fd = fopen(path, "r"); + if (!fd) + goto out; + lcpu_mask = NULL; + rc = fscanf(fd, "%as", &lcpu_mask); + fclose(fd); + if (!lcpu_mask) + goto out; + cpumask_parse_user(lcpu_mask, strlen(lcpu_mask), + new->affinity_hint); + free(lcpu_mask); out: if (debug_mode) printf("Adding IRQ %d to database\n", irq); @@ -226,7 +206,7 @@ static void build_one_dev_entry(const char *dirname) new = add_one_irq_to_db(path, irqnum); if (!new) continue; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX; + new->type = IRQ_TYPE_MSIX; } } while (entry != NULL); closedir(msidir); @@ -248,20 +228,32 @@ static void build_one_dev_entry(const char *dirname) new = add_one_irq_to_db(path, irqnum); if (!new) goto done; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY; + new->type = IRQ_TYPE_LEGACY; } + done: fclose(fd); return; } +static void free_irq(struct irq_info *info, void *data __attribute__((unused))) +{ + free(info); +} + +void free_irq_db(void) +{ + 
for_each_irq(NULL, free_irq, NULL); + g_list_free(interrupts_db); + interrupts_db = NULL; +} void rebuild_irq_db(void) { DIR *devdir = opendir(SYSDEV_DIR); struct dirent *entry; - g_list_free_full(interrupts_db, free_int); + free_irq_db(); if (!devdir) return; @@ -278,83 +270,80 @@ void rebuild_irq_db(void) closedir(devdir); } -static GList *add_misc_irq(int irq) +struct irq_info *add_misc_irq(int irq) { - struct irq_info *new, find; + struct irq_info *new; - new = malloc(sizeof(struct irq_info)); + new = calloc(sizeof(struct irq_info), 1); if (!new) return NULL; - init_new_irq(new); new->irq = irq; - new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY; - new->property[IRQ_CLASS].iint_val = IRQ_OTHER; - new->property[IRQ_NUMA].iint_val = -1; + new->type = IRQ_TYPE_LEGACY; + new->class = IRQ_OTHER; + new->numa_node = get_numa_node(0); interrupts_db = g_list_append(interrupts_db, new); - find.irq = irq; - return g_list_find_custom(interrupts_db, &find, compare_ints); + return new; } -int find_irq_integer_prop(int irq, enum irq_prop prop) +void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data) { - GList *entry; - struct irq_info find, *result; - - find.irq = irq; + GList *entry = g_list_first(list ? 
list : interrupts_db); + GList *next; - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - - if (!entry) { - if (debug_mode) - printf("No entry for irq %d in the irq database, adding default entry\n", irq); - entry = add_misc_irq(irq); + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; } - - result = entry->data; - assert(result->property[prop].itype == INT_TYPE); - return result->property[prop].iint_val; } -cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop) +struct irq_info *get_irq_info(int irq) { GList *entry; - struct irq_info find, *result; - - find.irq = irq; - - entry = g_list_find_custom(interrupts_db, &find, compare_ints); - - if (!entry) { - if (debug_mode) - printf("No entry for irq %d in the irq database, adding default entry\n", irq); - entry = add_misc_irq(irq); - } - - result = entry->data; - assert(result->property[prop].itype == CPUMASK_TYPE); - return result->property[prop].imask_val; -} - -int get_next_irq(int irq) -{ - GList *entry; - struct irq_info *irqp, find; - - if (irq == -1) { - entry = g_list_first(interrupts_db); - irqp = entry->data; - return irqp->irq; - } + struct irq_info find; find.irq = irq; entry = g_list_find_custom(interrupts_db, &find, compare_ints); - if (!entry) + return entry ? 
entry->data : NULL; +} + +void migrate_irq(GList **from, GList **to, struct irq_info *info) +{ + GList *entry; + struct irq_info find, *tmp;; + + find.irq = info->irq; + entry = g_list_find_custom(*from, &find, compare_ints); + tmp = entry->data; + *from = g_list_delete_link(*from, entry); + + + *to = g_list_append(*to, tmp); + info->moved = 1; +} + +static gint sort_irqs(gconstpointer A, gconstpointer B) +{ + struct irq_info *a, *b; + a = (struct irq_info*)A; + b = (struct irq_info*)B; + + if (a->class < b->class) + return 1; + if (a->class > b->class) return -1; - - entry = g_list_next(entry); - if (!entry) + if (a->load < b->load) + return 1; + if (a->load > b->load) return -1; - irqp= entry->data; - return irqp->irq; + if (a= 0) { - ret = numa_node_to_cpus(node_num, node_mask); - if (ret) { - node_num--; - continue; - } - memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long)); - if (cpus_intersects(mask, cpu_node_mask)) { - numa_free_cpumask(node_mask); - return node_num; - } - node_num--; - } - - numa_free_cpumask(node_mask); - return node_num; -} - -static void fill_packages(void) +static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache, + cpumask_t package_mask) { GList *entry; + struct topo_obj *package; + struct topo_obj *lcache; + + entry = g_list_first(packages); - entry = g_list_first(cache_domains); while (entry) { - struct package *package; - struct cache_domain *cache = NULL; - GList *entry2; - - cache = entry->data; - entry2 = entry; - entry = g_list_next(entry); - if (cache->marker) - continue; - package = malloc(sizeof(struct package)); - if (!package) + package = entry->data; + if (cpus_equal(package_mask, package->mask)) break; - memset(package, 0, sizeof(struct package)); - package->mask = cache->package_mask; - package->number = cache->number; - package->node_num = search_numa_node(package->mask); - while (entry2) { - struct cache_domain *cache2; - cache2 = entry2->data; - if 
(cpus_equal(cache->package_mask, cache2->package_mask)) { - cache2->marker = 1; - package->cache_domains = g_list_append(package->cache_domains, cache2); - if (package->number > cache2->number) - package->number = cache2->number; - } - entry2 = g_list_next(entry2); - } + entry = g_list_next(entry); + } + + if (!entry) { + package = calloc(sizeof(struct topo_obj), 1); + if (!package) + return NULL; + package->mask = package_mask; + package->obj_type = OBJ_TYPE_PACKAGE; + package->obj_type_list = &packages; packages = g_list_append(packages, package); package_count++; } -} -static void fill_cache_domain(void) + entry = g_list_first(package->children); + while (entry) { + lcache = entry->data; + if (lcache == cache) + break; + entry = g_list_next(entry); + } + + if (!entry) { + package->children = g_list_append(package->children, cache); + cache->parent = package; + } + + return package; +} +static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu, + cpumask_t cache_mask) { GList *entry; + struct topo_obj *cache; + struct topo_obj *lcpu; + + entry = g_list_first(cache_domains); - entry = g_list_first(cpus); while (entry) { - struct cache_domain *cache = NULL; - struct cpu_core *cpu; - GList *entry2; - cpu = entry->data; - entry2 = entry; - entry = g_list_next(entry); - if (cpu->marker) - continue; - cache = malloc(sizeof(struct cache_domain)); - if (!cache) + cache = entry->data; + if (cpus_equal(cache_mask, cache->mask)) break; - memset(cache, 0, sizeof(struct cache_domain)); - cache->mask = cpu->cache_mask; - cache->package_mask = cpu->package_mask; - cache->number = cpu->number; - cache->node_num = search_numa_node(cache->mask); + entry = g_list_next(entry); + } + + if (!entry) { + cache = calloc(sizeof(struct topo_obj), 1); + if (!cache) + return NULL; + cache->obj_type = OBJ_TYPE_CACHE; + cache->mask = cache_mask; + cache->number = cache_domain_count; + cache->obj_type_list = &cache_domains; cache_domains = g_list_append(cache_domains, cache); 
cache_domain_count++; - while (entry2) { - struct cpu_core *cpu2; - cpu2 = entry2->data; - if (cpus_equal(cpu->cache_mask, cpu2->cache_mask) && - cpus_equal(cpu->package_mask, cpu2->package_mask)) { - cpu2->marker = 1; - cache->cpu_cores = g_list_append(cache->cpu_cores, cpu2); - if (cpu2->number < cache->number) - cache->number = cpu2->number; - } - entry2 = g_list_next(entry2); - } } + + entry = g_list_first(cache->children); + while (entry) { + lcpu = entry->data; + if (lcpu == cpu) + break; + entry = g_list_next(entry); + } + + if (!entry) { + cache->children = g_list_append(cache->children, cpu); + cpu->parent = (struct topo_obj *)cache; + } + + return cache; } - - + static void do_one_cpu(char *path) { - struct cpu_core *cpu; + struct topo_obj *cpu; FILE *file; char new_path[PATH_MAX]; + cpumask_t cache_mask, package_mask; + struct topo_obj *cache; + struct topo_obj *package; + DIR *dir; + struct dirent *entry; + int nodeid; /* skip offline cpus */ snprintf(new_path, PATH_MAX, "%s/online", path); @@ -188,10 +169,11 @@ static void do_one_cpu(char *path) free(line); } - cpu = malloc(sizeof(struct cpu_core)); + cpu = calloc(sizeof(struct topo_obj), 1); if (!cpu) return; - memset(cpu, 0, sizeof(struct cpu_core)); + + cpu->obj_type = OBJ_TYPE_CPU; cpu->number = strtoul(&path[27], NULL, 10); @@ -199,9 +181,6 @@ static void do_one_cpu(char *path) cpu_set(cpu->number, cpu->mask); - /* set numa node of cpu */ - cpu->node_num = search_numa_node(cpu->mask); - /* if the cpu is on the banned list, just don't add it */ if (cpus_intersects(cpu->mask, banned_cpus)) { free(cpu); @@ -214,26 +193,26 @@ static void do_one_cpu(char *path) /* try to read the package mask; if it doesn't exist assume solitary */ snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path); file = fopen(new_path, "r"); - cpu_set(cpu->number, cpu->package_mask); + cpu_set(cpu->number, package_mask); if (file) { char *line = NULL; size_t size = 0; if (getline(&line, &size, file)) - 
cpumask_parse_user(line, strlen(line), cpu->package_mask); + cpumask_parse_user(line, strlen(line), package_mask); fclose(file); free(line); } /* try to read the cache mask; if it doesn't exist assume solitary */ /* We want the deepest cache level available so try index1 first, then index2 */ - cpu_set(cpu->number, cpu->cache_mask); + cpu_set(cpu->number, cache_mask); snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path); file = fopen(new_path, "r"); if (file) { char *line = NULL; size_t size = 0; if (getline(&line, &size, file)) - cpumask_parse_user(line, strlen(line), cpu->cache_mask); + cpumask_parse_user(line, strlen(line), cache_mask); fclose(file); free(line); } @@ -243,66 +222,94 @@ static void do_one_cpu(char *path) char *line = NULL; size_t size = 0; if (getline(&line, &size, file)) - cpumask_parse_user(line, strlen(line), cpu->cache_mask); + cpumask_parse_user(line, strlen(line), cache_mask); fclose(file); free(line); } + nodeid=0; + dir = opendir(path); + do { + entry = readdir(dir); + if (!entry) + break; + if (strstr(entry->d_name, "node")) { + nodeid = strtoul(&entry->d_name[4], NULL, 10); + break; + } + } while (entry); + closedir(dir); + + cache = add_cpu_to_cache_domain(cpu, cache_mask); + package = add_cache_domain_to_package(cache, package_mask); + add_package_to_node(package, nodeid); + /* blank out the banned cpus from the various masks so that interrupts will never be told to go there */ - cpus_and(cpu->cache_mask, cpu->cache_mask, unbanned_cpus); - cpus_and(cpu->package_mask, cpu->package_mask, unbanned_cpus); + cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus); + cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus); cpus_and(cpu->mask, cpu->mask, unbanned_cpus); + cpu->obj_type_list = &cpus; cpus = g_list_append(cpus, cpu); core_count++; } -static void dump_irqs(int spaces, GList *dump_interrupts) +static void dump_irq(struct irq_info *info, void *data) { - struct interrupt 
*irq; - while (dump_interrupts) { - int i; - for (i=0; idata; - printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload); - dump_interrupts = g_list_next(dump_interrupts); - } + int spaces = (long int)data; + int i; + for (i=0; iirq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load); +} + +static void dump_topo_obj(struct topo_obj *d, void *data __attribute__((unused))) +{ + struct topo_obj *c = (struct topo_obj *)d; + printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load); + if (c->interrupts) + for_each_irq(c->interrupts, dump_irq, (void *)18); +} + +static void dump_cache_domain(struct topo_obj *d, void *data) +{ + char *buffer = data; + cpumask_scnprintf(buffer, 4095, d->mask); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load); + if (d->children) + for_each_object(d->children, dump_topo_obj, NULL); + if (d->interrupts) + for_each_irq(d->interrupts, dump_irq, (void *)10); +} + +static void dump_package(struct topo_obj *d, void *data) +{ + char *buffer = data; + cpumask_scnprintf(buffer, 4096, d->mask); + printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load); + if (d->children) + for_each_object(d->children, dump_cache_domain, buffer); + if (d->interrupts) + for_each_irq(d->interrupts, dump_irq, (void *)2); } void dump_tree(void) { - GList *p_iter, *c_iter, *cp_iter; - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - char buffer[4096]; - p_iter = g_list_first(packages); - while (p_iter) { - package = p_iter->data; - cpumask_scnprintf(buffer, 4096, package->mask); - printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, 
(unsigned long)package->workload); - c_iter = g_list_first(package->cache_domains); - while (c_iter) { - cache_domain = c_iter->data; - c_iter = g_list_next(c_iter); - cpumask_scnprintf(buffer, 4095, cache_domain->mask); - printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload); - cp_iter = cache_domain->cpu_cores; - while (cp_iter) { - cpu = cp_iter->data; - cp_iter = g_list_next(cp_iter); - printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload); - dump_irqs(18, cpu->interrupts); - } - dump_irqs(10, cache_domain->interrupts); - } - dump_irqs(2, package->interrupts); - p_iter = g_list_next(p_iter); - } + for_each_object(packages, dump_package, buffer); +} + +static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused))) +{ + info->load = 0; +} + +static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused))) +{ + for_each_object(d->children, clear_obj_stats, NULL); + for_each_irq(d->interrupts, clear_irq_stats, NULL); } /* @@ -310,40 +317,9 @@ void dump_tree(void) * which level does how much work and the actual lists of interrupts * assigned to each component */ -void clear_work_stats(void) +void clear_work_stats() { - GList *p_iter, *c_iter, *cp_iter; - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - - p_iter = g_list_first(packages); - while (p_iter) { - package = p_iter->data; - package->workload = 0; - g_list_free(package->interrupts); - package->interrupts = NULL; - c_iter = g_list_first(package->cache_domains); - memset(package->class_count, 0, sizeof(package->class_count)); - while (c_iter) { - cache_domain = c_iter->data; - c_iter = g_list_next(c_iter); - cache_domain->workload = 0; - cp_iter = cache_domain->cpu_cores; - g_list_free(cache_domain->interrupts); - cache_domain->interrupts = NULL; - 
memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count)); - while (cp_iter) { - cpu = cp_iter->data; - cp_iter = g_list_next(cp_iter); - cpu->workload = 0; - g_list_free(cpu->interrupts); - cpu->interrupts = NULL; - memset(cpu->class_count, 0, sizeof(cpu->class_count)); - } - } - p_iter = g_list_next(p_iter); - } + for_each_object(numa_nodes, clear_obj_stats, NULL); } @@ -373,9 +349,6 @@ void parse_cpu_tree(void) } while (entry); closedir(dir); - fill_cache_domain(); - fill_packages(); - if (debug_mode) dump_tree(); @@ -389,14 +362,14 @@ void parse_cpu_tree(void) void clear_cpu_tree(void) { GList *item; - struct cpu_core *cpu; - struct cache_domain *cache_domain; - struct package *package; + struct topo_obj *cpu; + struct topo_obj *cache_domain; + struct topo_obj *package; while (packages) { item = g_list_first(packages); package = item->data; - g_list_free(package->cache_domains); + g_list_free(package->children); g_list_free(package->interrupts); free(package); packages = g_list_delete_link(packages, item); @@ -406,7 +379,7 @@ void clear_cpu_tree(void) while (cache_domains) { item = g_list_first(cache_domains); cache_domain = item->data; - g_list_free(cache_domain->cpu_cores); + g_list_free(cache_domain->children); g_list_free(cache_domain->interrupts); free(cache_domain); cache_domains = g_list_delete_link(cache_domains, item); @@ -424,3 +397,28 @@ void clear_cpu_tree(void) core_count = 0; } + +static gint compare_cpus(gconstpointer a, gconstpointer b) +{ + const struct topo_obj *ai = a; + const struct topo_obj *bi = b; + + return ai->number - bi->number; +} + +struct topo_obj *find_cpu_core(int cpunr) +{ + GList *entry; + struct topo_obj find; + + find.number = cpunr; + entry = g_list_find_custom(cpus, &find, compare_cpus); + + return entry ? 
entry->data : NULL; +} + +int get_cpu_count(void) +{ + return g_list_length(cpus); +} + diff --git a/irqbalance.1 b/irqbalance.1 index c26c709..8fbc48d 100755 --- a/irqbalance.1 +++ b/irqbalance.1 @@ -41,6 +41,19 @@ Causes irqbalance to be run once, after which the daemon exits .B --debug Causes irqbalance to run in the foreground and extra debug information to be printed +.TP +.B --hintpolicy=[exact | subset | ignore] +Set the policy for how irq kernel affinity hinting is treated. Can be one of: +.P +.I exact +irq affinity hint is applied unilaterally and never violated +.P +.I subset +irq is balanced, but the assigned object will be a subset of the affintiy hint +.P +.I ignore +irq affinity hint value is completely ignored + .SH "ENVIRONMENT VARIABLES" .TP .B IRQBALANCE_ONESHOT diff --git a/irqbalance.c b/irqbalance.c index 8ae8197..5bab859 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -38,13 +38,11 @@ int one_shot_mode; int debug_mode; int numa_avail; - int need_cpu_rescan; - extern cpumask_t banned_cpus; - -static int counter; - +enum hp_e hint_policy = HINT_POLICY_SUBSET; +unsigned long power_thresh = ULONG_MAX; +unsigned long long cycle_count = 0; void sleep_approx(int seconds) { @@ -64,12 +62,15 @@ void sleep_approx(int seconds) struct option lopts[] = { {"oneshot", 0, NULL, 'o'}, {"debug", 0, NULL, 'd'}, + {"hintpolicy", 1, NULL, 'h'}, + {"powerthresh", 1, NULL, 'p'}, {0, 0, 0, 0} }; static void usage(void) { - printf("irqbalance [--oneshot | -o] [--debug | -d]"); + printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]\n"); + printf(" [--powerthresh= | -p | ]\n"); } static void parse_command_line(int argc, char **argv) @@ -78,7 +79,7 @@ static void parse_command_line(int argc, char **argv) int longind; while ((opt = getopt_long(argc, argv, - "", + "odh:p:", lopts, &longind)) != -1) { switch(opt) { @@ -88,6 +89,29 @@ static void parse_command_line(int argc, char **argv) case 'd': debug_mode=1; break; + case 'h': + if 
(!strncmp(optarg, "exact", strlen(optarg))) + hint_policy = HINT_POLICY_EXACT; + else if (!strncmp(optarg, "subset", strlen(optarg))) + hint_policy = HINT_POLICY_SUBSET; + else if (!strncmp(optarg, "ignore", strlen(optarg))) + hint_policy = HINT_POLICY_IGNORE; + else { + usage(); + exit(1); + } + break; + case 'p': + if (!strncmp(optarg, "off", strlen(optarg))) + power_thresh = ULONG_MAX; + else { + power_thresh = strtoull(optarg, NULL, 10); + if (power_thresh == ULONG_MAX) { + usage(); + exit(1); + } + } + break; case 'o': one_shot_mode=1; break; @@ -96,6 +120,50 @@ static void parse_command_line(int argc, char **argv) } #endif +/* + * This builds our object tree. The Heirarchy is pretty straightforward + * At the top are numa_nodes + * All CPU packages belong to a single numa_node + * All Cache domains belong to a CPU package + * All CPU cores belong to a cache domain + * + * Objects are built in that order (top down) + * + * Object workload is the aggregate sum of the + * workload of the objects below it + */ +static void build_object_tree() +{ + build_numa_node_list(); + parse_cpu_tree(); + rebuild_irq_db(); +} + +static void free_object_tree() +{ + free_numa_node_list(); + clear_cpu_tree(); + free_irq_db(); +} + +static void dump_object_tree() +{ + for_each_object(numa_nodes, dump_numa_node_info, NULL); +} + +static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused))) +{ + if (info->level == BALANCE_NONE) + return; + + if (info->assigned_obj == NULL) + rebalance_irq_list = g_list_append(rebalance_irq_list, info); + else + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + + info->assigned_obj = NULL; +} + int main(int argc, char** argv) { @@ -125,9 +193,9 @@ int main(int argc, char** argv) } - rebuild_irq_db(); - - parse_cpu_tree(); + build_object_tree(); + if (debug_mode) + dump_object_tree(); /* On single core UP systems irqbalance obviously has no work to do */ @@ -150,15 +218,10 @@ int main(int argc, 
char** argv) capng_apply(CAPNG_SELECT_BOTH); #endif + for_each_irq(NULL, force_rebalance_irq, NULL); + parse_proc_interrupts(); - sleep(SLEEP_INTERVAL/4); - reset_counts(); - parse_proc_interrupts(); - pci_numa_scan(); - calculate_workload(); - sort_irq_list(); - if (debug_mode) - dump_workloads(); + parse_proc_stat(); while (1) { sleep_approx(SLEEP_INTERVAL); @@ -166,8 +229,9 @@ int main(int argc, char** argv) printf("\n\n\n-----------------------------------------------------------------------------\n"); - check_power_mode(); + clear_work_stats(); parse_proc_interrupts(); + parse_proc_stat(); /* cope with cpu hotplug -- detected during /proc/interrupts parsing */ if (need_cpu_rescan) { @@ -179,25 +243,31 @@ int main(int argc, char** argv) reset_counts(); clear_work_stats(); - clear_cpu_tree(); - parse_cpu_tree(); - } + free_object_tree(); + build_object_tree(); + for_each_irq(NULL, force_rebalance_irq, NULL); + parse_proc_interrupts(); + parse_proc_stat(); + sleep_approx(SLEEP_INTERVAL); + clear_work_stats(); + parse_proc_interrupts(); + parse_proc_stat(); + cycle_count=0; + } - calculate_workload(); - - /* to cope with dynamic configurations we scan for new numa information - * once every 5 minutes - */ - pci_numa_scan(); + if (cycle_count) + update_migration_status(); calculate_placement(); - activate_mapping(); + activate_mappings(); if (debug_mode) dump_tree(); if (one_shot_mode) break; - counter++; + cycle_count++; + } + free_object_tree(); return EXIT_SUCCESS; } diff --git a/irqbalance.h b/irqbalance.h index 3e76353..a1b1e8a 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -8,47 +8,116 @@ #include #include +#include #include "types.h" #include -struct interrupt; - extern int package_count; extern int cache_domain_count; extern int core_count; extern char *classes[]; -extern int map_class_to_level[7]; -extern int class_counts[7]; -extern int debug_mode; -extern int power_mode; -extern int need_cpu_rescan; -extern int one_shot_mode; -extern GList *interrupts; 
extern void parse_cpu_tree(void); extern void clear_work_stats(void); extern void parse_proc_interrupts(void); -extern void rebuild_irq_db(void); +extern void parse_proc_stat(void); extern void set_interrupt_count(int number, uint64_t count); extern void set_msi_interrupt_numa(int number); -extern int get_next_irq(int irq); -extern int find_irq_integer_prop(int irq, enum irq_prop prop); -extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop); -extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); +extern GList *rebalance_irq_list; -void calculate_workload(void); +void update_migration_status(void); void reset_counts(void); void dump_workloads(void); -void sort_irq_list(void); +void sort_irq_list(GList **list); void calculate_placement(void); void dump_tree(void); -void activate_mapping(void); +void activate_mappings(void); void account_for_nic_stats(void); void check_power_mode(void); void clear_cpu_tree(void); void pci_numa_scan(void); +/*===================NEW BALANCER FUNCTIONS============================*/ + +/* + * Master topo_obj type lists + */ +extern GList *numa_nodes; +extern GList *packages; +extern GList *cache_domains; +extern GList *cpus; + +enum hp_e { + HINT_POLICY_IGNORE, + HINT_POLICY_SUBSET, + HINT_POLICY_EXACT +}; + +extern int debug_mode; +extern int one_shot_mode; +extern int power_mode; +extern int need_cpu_rescan; +extern enum hp_e hint_policy; +extern unsigned long long cycle_count; +extern unsigned long power_thresh; + +/* + * Numa node access routines + */ +extern void build_numa_node_list(void); +extern void free_numa_node_list(void); +extern void dump_numa_node_info(struct topo_obj *node, void *data); +extern void add_package_to_node(struct topo_obj *p, int nodeid); +extern struct topo_obj *get_numa_node(int nodeid); + +/* + * Package functions + */ +#define package_numa_node(p) ((p)->parent) + +/* + * cache_domain functions + */ +#define cache_domain_package(c) ((c)->parent) +#define 
cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c)))) + +/* + * cpu core functions + */ +#define cpu_cache_domain(cpu) ((cpu)->parent) +#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu)))) +#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu))))) +extern struct topo_obj *find_cpu_core(int cpunr); +extern int get_cpu_count(void); + +/* + * irq db functions + */ +extern void rebuild_irq_db(void); +extern void free_irq_db(void); +extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data); +extern struct irq_info *get_irq_info(int irq); +extern void migrate_irq(GList **from, GList **to, struct irq_info *info); +extern struct irq_info *add_misc_irq(int irq); +#define irq_numa_node(irq) ((irq)->numa_node) + + +/* + * Generic object functions + */ +static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data) +{ + GList *entry, *next; + entry = g_list_first(list); + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + #endif + diff --git a/irqlist.c b/irqlist.c index 6ea48b1..c82131a 100644 --- a/irqlist.c +++ b/irqlist.c @@ -29,285 +29,183 @@ #include #include #include +#include #include "types.h" #include "irqbalance.h" -GList *interrupts; +struct load_balance_info { + unsigned long long int total_load; + unsigned long long avg_load; + int load_sources; + unsigned long long int deviations; + long double std_deviation; + unsigned int num_within; + unsigned int num_over; + unsigned int num_under; + struct topo_obj *powersave; +}; -void get_affinity_hint(struct interrupt *irq, int number) +static void gather_load_stats(struct topo_obj *obj, void *data) { - char buf[PATH_MAX]; - cpumask_t tempmask; - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/affinity_hint", number); - file = fopen(buf, "r"); - if (!file) - return; - if (getline(&line, 
&size, file)==0) { - free(line); - fclose(file); - return; - } - cpumask_parse_user(line, strlen(line), tempmask); - if (!__cpus_full(&tempmask, num_possible_cpus())) - irq->node_mask = tempmask; - fclose(file); - free(line); + struct load_balance_info *info = data; + + info->total_load += obj->load; + info->load_sources += 1; } -/* - * This function classifies and reads various things from /proc about a specific irq - */ -static void investigate(struct interrupt *irq, int number) +static void compute_deviations(struct topo_obj *obj, void *data) { - DIR *dir; - struct dirent *entry; - char *c, *c2; - int nr , count = 0, can_set = 1; - char buf[PATH_MAX]; - sprintf(buf, "/proc/irq/%i", number); - dir = opendir(buf); - do { - entry = readdir(dir); - if (!entry) - break; - if (strcmp(entry->d_name,"smp_affinity")==0) { - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/smp_affinity", number); - file = fopen(buf, "r+"); - if (!file) - continue; - if (getline(&line, &size, file)==0) { - free(line); - fclose(file); - continue; - } - cpumask_parse_user(line, strlen(line), irq->mask); - /* - * Check that we can write the affinity, if - * not take it out of the list. 
- */ - fputs(line, file); - if (fclose(file) && errno == EIO) - can_set = 0; - free(line); - } else if (strcmp(entry->d_name,"allowed_affinity")==0) { - char *line = NULL; - size_t size = 0; - FILE *file; - sprintf(buf, "/proc/irq/%i/allowed_affinity", number); - file = fopen(buf, "r"); - if (!file) - continue; - if (getline(&line, &size, file)==0) { - free(line); - fclose(file); - continue; - } - cpumask_parse_user(line, strlen(line), irq->allowed_mask); - fclose(file); - free(line); - } else if (strcmp(entry->d_name,"affinity_hint")==0) { - get_affinity_hint(irq, number); - } else { - irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS); - } + struct load_balance_info *info = data; + unsigned long long int deviation; - } while (entry); - closedir(dir); - irq->balance_level = map_class_to_level[irq->class]; + deviation = (obj->load > info->avg_load) ? + obj->load - info->avg_load : + info->avg_load - obj->load; - for (nr = 0; nr < NR_CPUS; nr++) - if (cpu_isset(nr, irq->allowed_mask)) - count++; - - /* if there is no choice in the allowed mask, don't bother to balance */ - if ((count<2) || (can_set == 0)) - irq->balance_level = BALANCE_NONE; - - - /* next, check the IRQBALANCE_BANNED_INTERRUPTS env variable for blacklisted irqs */ - c = c2 = getenv("IRQBALANCE_BANNED_INTERRUPTS"); - if (!c) - return; - - do { - c = c2; - nr = strtoul(c, &c2, 10); - if (c!=c2 && nr == number) - irq->balance_level = BALANCE_NONE; - } while (c!=c2 && c2!=NULL); + info->deviations += (deviation * deviation); } -/* Set numa node number for MSI interrupt; - * Assumes existing irq metadata - */ -void set_msi_interrupt_numa(int number) +static void move_candidate_irqs(struct irq_info *info, void *data) { - GList *item; - struct interrupt *irq; - int node; + int *remaining_deviation = (int *)data; - node = find_irq_integer_prop(number, IRQ_NUMA); - if (node < 0) - return; - - item = g_list_first(interrupts); - while (item) { - irq = item->data; - - if (irq->number == number) { - 
irq->node_num = node; - irq->msi = 1; + /* never move an irq that has an afinity hint when + * hint_policy is HINT_POLICY_EXACT + */ + if (hint_policy == HINT_POLICY_EXACT) + if (!cpus_empty(info->affinity_hint)) return; - } - item = g_list_next(item); - } -} -/* - * Set the number of interrupts received for a specific irq; - * create the irq metadata if there is none yet - */ -void set_interrupt_count(int number, uint64_t count) -{ - GList *item; - struct interrupt *irq; - - if (count < MIN_IRQ_COUNT && !one_shot_mode) - return; /* no need to track or set interrupts sources without any activity since boot - but allow for a few (20) boot-time-only interrupts */ - - item = g_list_first(interrupts); - while (item) { - irq = item->data; - - if (irq->number == number) { - irq->count = count; - /* see if affinity_hint changed */ - get_affinity_hint(irq, number); - return; - } - item = g_list_next(item); - } - /* new interrupt */ - irq = malloc(sizeof(struct interrupt)); - if (!irq) + /* Don't rebalance irqs that don't want it */ + if (info->level == BALANCE_NONE) return; - memset(irq, 0, sizeof(struct interrupt)); - irq->node_num = -1; - irq->number = number; - irq->count = count; - irq->allowed_mask = CPU_MASK_ALL; - investigate(irq, number); - interrupts = g_list_append(interrupts, irq); + + /* Don't move cpus that only have one irq, regardless of load */ + if (g_list_length(info->assigned_obj->interrupts) <= 1) + return; + + /* Stop rebalancing if we've estimated a full reduction of deviation */ + if (*remaining_deviation <= 0) + return; + + *remaining_deviation -= info->load; + + if (debug_mode) + printf("Selecting irq %d for rebalancing\n", info->irq); + + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + + info->assigned_obj = NULL; } -/* - * Set the numa affinity mask for a specific interrupt if there - * is metadata for the interrupt; do nothing if no such data - * exists. 
- */ -void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type) +static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) { - GList *item; - struct interrupt *irq; + struct load_balance_info *info = data; + int deviation; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); + /* + * Don't rebalance irqs on objects whos load is below the average + */ + if (obj->load <= info->avg_load) { + if ((obj->load + info->std_deviation) <= info->avg_load) { + info->num_under++; + info->powersave = obj; + } else + info->num_within++; + return; + } - if (irq->number == number) { - cpus_or(irq->numa_mask, irq->numa_mask, mask); - irq->node_num = node_num; - if (irq->class < type && irq->balance_level != BALANCE_NONE) { - irq->class = type; - irq->balance_level = map_class_to_level[irq->class]; - } - return; + deviation = obj->load - info->avg_load; + + if ((deviation > info->std_deviation) && + (g_list_length(obj->interrupts) > 1)) { + + info->num_over++; + /* + * We have a cpu that is overloaded and + * has irqs that can be moved to fix that + */ + + /* order the list from least to greatest workload */ + sort_irq_list(&obj->interrupts); + /* + * Each irq carries a weighted average amount of load + * we think its responsible for. 
Set deviation to be the load + * of the difference between this objects load and the averate, + * and migrate irqs until we only have one left, or until that + * difference reaches zero + */ + for_each_irq(obj->interrupts, move_candidate_irqs, &deviation); + } else + info->num_within++; + +} + +static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused))) +{ + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); +} + +static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused))) +{ + obj->powersave_mode = 0; +} + +#define find_overloaded_objs(name, info) do {\ + int ___load_sources;\ + memset(&(info), 0, sizeof(struct load_balance_info));\ + for_each_object((name), gather_load_stats, &(info));\ + (info).avg_load = (info).total_load / (info).load_sources;\ + for_each_object((name), compute_deviations, &(info));\ + ___load_sources = ((info).load_sources == 1) ? 1 : ((info).load_sources - 1);\ + (info).std_deviation = (long double)((info).deviations / ___load_sources);\ + (info).std_deviation = sqrt((info).std_deviation);\ + for_each_object((name), migrate_overloaded_irqs, &(info));\ +}while(0) + +void update_migration_status(void) +{ + struct load_balance_info info; + + find_overloaded_objs(cpus, info); + if (cycle_count > 5) { + if (!info.num_over && (info.num_under >= power_thresh)) { + syslog(LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number); + info.powersave->powersave_mode = 1; + for_each_irq(info.powersave->interrupts, force_irq_migration, NULL); + } else if (info.num_over) { + syslog(LOG_INFO, "Load average increasing, re-enabling all cpus for irq balancing\n"); + for_each_object(cpus, clear_powersave_mode, NULL); } } + find_overloaded_objs(cache_domains, info); + find_overloaded_objs(packages, info); + find_overloaded_objs(numa_nodes, info); } -void calculate_workload(void) + +static void reset_irq_count(struct irq_info *info, void *unused 
__attribute__((unused))) { - int i; - GList *item; - struct interrupt *irq; - - for (i=0; i<7; i++) - class_counts[i]=0; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - - irq->workload = irq->count - irq->old_count + irq->workload/3 + irq->extra; - class_counts[irq->class]++; - irq->old_count = irq->count; - irq->extra = 0; - } + info->last_irq_count = info->irq_count; + info->irq_count = 0; } void reset_counts(void) { - GList *item; - struct interrupt *irq; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - irq->old_count = irq->count; - irq->extra = 0; + for_each_irq(NULL, reset_irq_count, NULL); +} - } + +static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) +{ + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load); } void dump_workloads(void) { - GList *item; - struct interrupt *irq; - item = g_list_first(interrupts); - while (item) { - irq = item->data; - item = g_list_next(item); - - printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload); - - } + for_each_irq(NULL, dump_workload, NULL); } - -static gint sort_irqs(gconstpointer A, gconstpointer B) -{ - struct interrupt *a, *b; - a = (struct interrupt*)A; - b = (struct interrupt*)B; - - if (a->class < b->class) - return 1; - if (a->class > b->class) - return -1; - if (a->workload < b->workload) - return 1; - if (a->workload > b->workload) - return -1; - if (alow) and then by workload (high->low) */ - interrupts = g_list_sort(interrupts, sort_irqs); -} diff --git a/numa.c b/numa.c index 51ee88c..6e00243 100644 --- a/numa.c +++ b/numa.c @@ -33,24 +33,130 @@ #include "irqbalance.h" -void pci_numa_scan(void) +#define SYSFS_NODE_PATH "/sys/devices/system/node" + +GList *numa_nodes = NULL; + 
+struct topo_obj unspecified_node = { + .load = 0, + .number = -1, + .obj_type = OBJ_TYPE_NODE, + .mask = CPU_MASK_ALL, + .interrupts = NULL, + .children = NULL, + .parent = NULL, + .obj_type_list = &numa_nodes, +}; + +static void add_one_node(const char *nodename) { - int irq = -1; - cpumask_t mask; - int node_num; - do { - int type; - irq = get_next_irq(irq); - if (irq == -1) - break; + char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1); + struct topo_obj *new; + char *cpustr; + FILE *f; - mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK); - - node_num = find_irq_integer_prop(irq, IRQ_NUMA); - - type = find_irq_integer_prop(irq, IRQ_CLASS); - - add_interrupt_numa(irq, mask, node_num, type); - - } while (irq != -1); + if (!path) + return; + new = calloc(1, sizeof(struct topo_obj)); + if (!new) + return; + sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename); + f = fopen(path, "r"); + if (ferror(f)) { + cpus_clear(new->mask); + } else { + fscanf(f, "%as", &cpustr); + if (!cpustr) { + cpus_clear(new->mask); + } else { + cpumask_parse_user(cpustr, strlen(cpustr), new->mask); + free(cpustr); + } + } + new->obj_type = OBJ_TYPE_NODE; + new->number = strtoul(&nodename[4], NULL, 10); + new->obj_type_list = &numa_nodes; + numa_nodes = g_list_append(numa_nodes, new); } + +void build_numa_node_list(void) +{ + DIR *dir = opendir(SYSFS_NODE_PATH); + struct dirent *entry; + + do { + entry = readdir(dir); + if (!entry) + break; + if ((entry->d_type == DT_DIR) && (strstr(entry->d_name, "node"))) { + add_one_node(entry->d_name); + } + } while (entry); +} + +static void free_numa_node(gpointer data) +{ + free(data); +} + +void free_numa_node_list(void) +{ + g_list_free_full(numa_nodes, free_numa_node); + numa_nodes = NULL; +} + +static gint compare_node(gconstpointer a, gconstpointer b) +{ + const struct topo_obj *ai = a; + const struct topo_obj *bi = b; + + return (ai->number == bi->number) ? 
0 : 1; +} + +void add_package_to_node(struct topo_obj *p, int nodeid) +{ + struct topo_obj find, *node; + find.number = nodeid; + GList *entry; + + find.number = nodeid; + entry = g_list_find_custom(numa_nodes, &find, compare_node); + + if (!entry) { + if (debug_mode) + printf("Could not find numa node for node id %d\n", nodeid); + return; + } + + node = entry->data; + + if (!p->parent) { + node->children = g_list_append(node->children, p); + p->parent = node; + } +} + +void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused))) +{ + char buffer[4096]; + + printf("NUMA NODE NUMBER: %d\n", d->number); + cpumask_scnprintf(buffer, 4096, d->mask); + printf("LOCAL CPU MASK: %s\n", buffer); + printf("\n"); +} + +struct topo_obj *get_numa_node(int nodeid) +{ + struct topo_obj find; + GList *entry; + + if (nodeid == -1) + return &unspecified_node; + + find.number = nodeid; + + entry = g_list_find_custom(numa_nodes, &find, compare_node); + return entry ? entry->data : NULL; +} + diff --git a/placement.c b/placement.c index 828ce8e..cfa419e 100644 --- a/placement.c +++ b/placement.c @@ -30,355 +30,167 @@ int power_mode; -extern GList *interrupts, *packages, *cache_domains, *cpus; +GList *rebalance_irq_list; -static uint64_t package_cost_func(struct interrupt *irq, struct package *package) +struct obj_placement { + struct topo_obj *best; + struct topo_obj *least_irqs; + uint64_t best_cost; + struct irq_info *info; +}; + +static void find_best_object(struct topo_obj *d, void *data) { - int bonus = 0; - int maxcount; - int dist; - /* moving to a cold package/cache/etc gets you a 3000 penalty */ - if (!cpus_intersects(irq->old_mask, package->mask)) - bonus = CROSS_PACKAGE_PENALTY; + struct obj_placement *best = (struct obj_placement *)data; + uint64_t newload; + cpumask_t subset; - /* do a little numa affinity */ - if (irq->node_num != package->node_num) { - if (irq->node_num >= 0 && package->node_num >= 0) { - dist = numa_distance(irq->node_num, 
package->node_num); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + /* + * If the hint policy is subset, then we only want + * to consider objects that are within the irqs hint, but + * only if that irq in fact has published a hint + */ + if (hint_policy == HINT_POLICY_SUBSET) { + if (!cpus_empty(best->info->affinity_hint)) { + cpus_and(subset, best->info->affinity_hint, d->mask); + if (cpus_empty(subset)) + return; } } - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; + if (d->powersave_mode) + return; - /* in power save mode, you better be on package 0, with overflow to the next package if really needed */ - if (power_mode) - bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number; - - /* if we're out of whack in terms of per class counts.. just block (except in power mode) */ - maxcount = (class_counts[irq->class] + package_count -1 ) / package_count; - if (package->class_count[irq->class]>=maxcount && !power_mode) - bonus += 300000; - - /* if the package has no cpus in the allowed mask.. just block */ - if (!cpus_intersects(irq->allowed_mask, package->mask)) - bonus += 600000; - - return irq->workload + bonus; -} - -static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain) -{ - int bonus = 0; - int dist; - - /* moving to a cold cache gets you a 1500 penalty */ - if (!cpus_intersects(irq->old_mask, cache_domain->mask)) - bonus = CROSS_PACKAGE_PENALTY/2; - - /* do a little numa affinity */ - if (irq->node_num != cache_domain->node_num) { - if (irq->node_num >= 0 && cache_domain->node_num >= 0) { - dist = numa_distance(irq->node_num, cache_domain->node_num); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? 
NUMA_PENALTY * (dist-10) : 0; - } + newload = d->load; + if (newload < best->best_cost) { + best->best = d; + best->best_cost = newload; + best->least_irqs = NULL; } - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; - - - /* pay 6000 for each previous interrupt of the same class */ - bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class]; - - /* try to avoid having a lot of MSI interrupt (globally, no by devide id) on - * cache domain */ - if (irq->msi == 1) - bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class]; - - /* if the cache domain has no cpus in the allowed mask.. just block */ - if (!cpus_intersects(irq->allowed_mask, cache_domain->mask)) - bonus += 600000; - - return irq->workload + bonus; -} - -static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) -{ - int bonus = 0; - int dist; - - /* moving to a colder core gets you a 1000 penalty */ - if (!cpus_intersects(irq->old_mask, cpu->mask)) - bonus = CROSS_PACKAGE_PENALTY/3; - - /* do a little numa affinity */ - if (irq->node_num != cpu->node_num) { - if (irq->node_num >= 0 && cpu->node_num >= 0) { - dist = numa_distance(irq->node_num, cpu->node_num); - /* moving to a distant numa node results into penalty */ - bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; - } - } - - /* but if the irq has had 0 interrupts for a while move it about more easily */ - if (irq->workload==0) - bonus = bonus / 10; - - /* - * since some chipsets only place at the first cpu, give a tiny preference to non-first - * cpus for specifically placed interrupts - */ - if (first_cpu(cpu->cache_mask)==cpu->number) - bonus++; - - /* pay 6000 for each previous interrupt of the same class */ - bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class]; - - /* if the core has no cpus in the allowed mask.. 
just block */ - if (!cpus_intersects(irq->allowed_mask, cpu->mask)) - bonus += 600000; - - return irq->workload + bonus; -} - - -static void place_cache_domain(struct package *package) -{ - GList *iter, *next; - GList *pkg; - struct interrupt *irq; - struct cache_domain *cache_domain; - - - iter = g_list_first(package->interrupts); - while (iter) { - struct cache_domain *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; - - if (irq->balance_level <= BALANCE_PACKAGE) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(package->cache_domains); - while (pkg) { - uint64_t newload; - - cache_domain = pkg->data; - newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain); - if (newload < best_cost) { - best = cache_domain; - best_cost = newload; - } - - pkg = g_list_next(pkg); - } - if (best) { - next = g_list_next(iter); - package->interrupts = g_list_delete_link(package->interrupts, iter); - - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - iter = next; - } else - iter = g_list_next(iter); + if (newload == best->best_cost) { + if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts)) + best->least_irqs = d; } } - -static void place_core(struct cache_domain *cache_domain) +static void find_best_object_for_irq(struct irq_info *info, void *data) { - GList *iter, *next; - GList *pkg; - struct interrupt *irq; - struct cpu_core *cpu; + struct obj_placement place; + struct topo_obj *d = data; + struct topo_obj *asign; + if (!info->moved) + return; - iter = g_list_first(cache_domain->interrupts); - while (iter) { - struct cpu_core *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; + switch (d->obj_type) { + case OBJ_TYPE_NODE: + if (info->level == BALANCE_NONE) + return; + break; - /* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */ - if 
(irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(cache_domain->cpu_cores); - while (pkg) { - uint64_t newload; + case OBJ_TYPE_PACKAGE: + if (info->level == BALANCE_PACKAGE) + return; + break; - cpu = pkg->data; - newload = cpu->workload + cpu_cost_func(irq, cpu); - if (newload < best_cost) { - best = cpu; - best_cost = newload; - } + case OBJ_TYPE_CACHE: + if (info->level == BALANCE_CACHE) + return; + break; - pkg = g_list_next(pkg); - } - if (best) { - next = g_list_next(iter); - cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter); - - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - iter = next; - } else - iter = g_list_next(iter); + case OBJ_TYPE_CPU: + if (info->level == BALANCE_CORE) + return; + break; + } + + place.info = info; + place.best = NULL; + place.least_irqs = NULL; + place.best_cost = INT_MAX; + + for_each_object(d->children, find_best_object, &place); + + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&d->interrupts, &asign->interrupts, info); + info->assigned_obj = asign; + asign->load += info->load; } } - -static void place_packages(GList *list) +static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused))) { - GList *iter; - GList *pkg; - struct interrupt *irq; - struct package *package; + if (g_list_length(d->interrupts) > 0) + for_each_irq(d->interrupts, find_best_object_for_irq, d); +} +static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused))) +{ + struct obj_placement place; + struct topo_obj *asign; - iter = g_list_first(list); - while (iter) { - struct package *best = NULL; - uint64_t best_cost = INT_MAX; - irq = iter->data; - if (irq->balance_level == BALANCE_NONE) { - iter = g_list_next(iter); - continue; - } - pkg = g_list_first(packages); - while (pkg) { - uint64_t newload; + if( info->level == BALANCE_NONE) + return; - package = pkg->data; - newload = package->workload + package_cost_func(irq, package); - if (newload < best_cost) { - best = package; - best_cost = newload; - } + if (irq_numa_node(info)->number != -1) { + /* + * This irq belongs to a device with a preferred numa node + * put it on that node + */ + migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info); + info->assigned_obj = irq_numa_node(info); + irq_numa_node(info)->load += info->load + 1; + return; + } - pkg = g_list_next(pkg); - } - if (best) { - best->workload += irq->workload + 1; - best->interrupts=g_list_append(best->interrupts, irq); - best->class_count[irq->class]++; - irq->mask = best->mask; - } - iter = g_list_next(iter); + place.best_cost = INT_MAX; + place.best = NULL; + place.least_irqs = NULL; + place.info = info; + + for_each_object(numa_nodes, find_best_object, &place); + + asign = place.least_irqs ? 
place.least_irqs : place.best; + + if (asign) { + migrate_irq(&rebalance_irq_list, &asign->interrupts, info); + info->assigned_obj = asign; + asign->load += info->load; } } - -static void place_affinity_hint(GList *list) +static void validate_irq(struct irq_info *info, void *data) { - /* still need to balance best workload within the affinity_hint mask */ - GList *iter; - struct interrupt *irq; - - iter = g_list_first(list); - while (iter) { - irq = iter->data; - if (irq->balance_level == BALANCE_NONE) { - iter = g_list_next(iter); - continue; - } - if ((!cpus_empty(irq->node_mask)) && - (!cpus_equal(irq->mask, irq->node_mask)) && - (!__cpus_full(&irq->node_mask, num_possible_cpus()))) { - irq->old_mask = irq->mask; - irq->mask = irq->node_mask; - } - - iter = g_list_next(iter); - } + if (info->assigned_obj != data) + printf("object validation error: irq %d is wrong, points to %p, should be %p\n", + info->irq, info->assigned_obj, data); } - -static void do_unroutables(void) +static void validate_object(struct topo_obj *d, void *data __attribute__((unused))) { - struct package *package; - struct cache_domain *cache_domain; - struct cpu_core *cpu; - struct interrupt *irq; - GList *iter, *inter; - - inter = g_list_first(interrupts); - while (inter) { - irq = inter->data; - inter = g_list_next(inter); - if (irq->balance_level != BALANCE_NONE) - continue; - - iter = g_list_first(packages); - while (iter) { - package = iter->data; - if (cpus_intersects(package->mask, irq->node_mask) || - cpus_intersects(package->mask, irq->mask)) - package->workload += irq->workload; - iter = g_list_next(iter); - } - - iter = g_list_first(cache_domains); - while (iter) { - cache_domain = iter->data; - if (cpus_intersects(cache_domain->mask, irq->node_mask) - || cpus_intersects(cache_domain->mask, irq->mask)) - cache_domain->workload += irq->workload; - iter = g_list_next(iter); - } - iter = g_list_first(cpus); - while (iter) { - cpu = iter->data; - if (cpus_intersects(cpu->mask, 
irq->node_mask) || - cpus_intersects(cpu->mask, irq->mask)) - cpu->workload += irq->workload; - iter = g_list_next(iter); - } - } + if (d->interrupts) + for_each_irq(d->interrupts, validate_irq, d); } +static void validate_object_tree_placement() +{ + for_each_object(packages, validate_object, NULL); + for_each_object(cache_domains, validate_object, NULL); + for_each_object(cpus, validate_object, NULL); +} void calculate_placement(void) { - struct package *package; - struct cache_domain *cache_domain; - GList *iter; - /* first clear old data */ - clear_work_stats(); - sort_irq_list(); - do_unroutables(); - - place_packages(interrupts); - iter = g_list_first(packages); - while (iter) { - package = iter->data; - place_cache_domain(package); - iter = g_list_next(iter); + sort_irq_list(&rebalance_irq_list); + if (g_list_length(rebalance_irq_list) > 0) { + for_each_irq(rebalance_irq_list, place_irq_in_node, NULL); + for_each_object(numa_nodes, place_irq_in_object, NULL); + for_each_object(packages, place_irq_in_object, NULL); + for_each_object(cache_domains, place_irq_in_object, NULL); } - - iter = g_list_first(cache_domains); - while (iter) { - cache_domain = iter->data; - place_core(cache_domain); - iter = g_list_next(iter); - } - /* - * if affinity_hint is populated on irq and is not set to - * all CPUs (meaning it's initialized), honor that above - * anything in the package locality/workload. 
- */ - place_affinity_hint(interrupts); + if (debug_mode) + validate_object_tree_placement(); } diff --git a/powermode.c b/powermode.c index 809cae8..82ba490 100644 --- a/powermode.c +++ b/powermode.c @@ -28,54 +28,7 @@ #include "irqbalance.h" -extern int power_mode; - -static uint64_t previous; - -static unsigned int hysteresis; - void check_power_mode(void) { - FILE *file; - char *line = NULL; - size_t size = 0; - char *c; - uint64_t dummy __attribute__((unused)); - uint64_t irq, softirq; - file = fopen("/proc/stat", "r"); - if (!file) - return; - if (getline(&line, &size, file)==0) - size=0; - fclose(file); - if (!line) - return; - c=&line[4]; - dummy = strtoull(c, &c, 10); /* user */ - dummy = strtoull(c, &c, 10); /* nice */ - dummy = strtoull(c, &c, 10); /* system */ - dummy = strtoull(c, &c, 10); /* idle */ - dummy = strtoull(c, &c, 10); /* iowait */ - irq = strtoull(c, &c, 10); /* irq */ - softirq = strtoull(c, &c, 10); /* softirq */ - - - irq += softirq; - printf("IRQ delta is %lu \n", (unsigned long)(irq - previous) ); - if (irq - previous < POWER_MODE_SOFTIRQ_THRESHOLD) { - hysteresis++; - if (hysteresis > POWER_MODE_HYSTERESIS) { - if (debug_mode && !power_mode) - printf("IRQ delta is %lu, switching to power mode \n", (unsigned long)(irq - previous) ); - power_mode = 1; - } - } else { - if (debug_mode && power_mode) - printf("IRQ delta is %lu, switching to performance mode \n", (unsigned long)(irq - previous) ); - power_mode = 0; - hysteresis = 0; - } - previous = irq; - free(line); } diff --git a/procinterrupts.c b/procinterrupts.c index cd76903..ebaea5e 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "cpumask.h" #include "irqbalance.h" @@ -39,7 +40,6 @@ void parse_proc_interrupts(void) FILE *file; char *line = NULL; size_t size = 0; - int int_type; file = fopen("/proc/interrupts", "r"); if (!file) @@ -48,6 +48,7 @@ void parse_proc_interrupts(void) /* first line is the header we don't 
need; nuke it */ if (getline(&line, &size, file)==0) { free(line); + fclose(file); return; } @@ -56,6 +57,7 @@ void parse_proc_interrupts(void) int number; uint64_t count; char *c, *c2; + struct irq_info *info; if (getline(&line, &size, file)==0) break; @@ -65,7 +67,11 @@ void parse_proc_interrupts(void) proc_int_has_msi = 1; /* lines with letters in front are special, like NMI count. Ignore */ - if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9'))) + c = line; + while (isblank(*(c))) + c++; + + if (!(*c>='0' && *c<='9')) break; c = strchr(line, ':'); if (!c) @@ -73,6 +79,10 @@ void parse_proc_interrupts(void) *c = 0; c++; number = strtoul(line, NULL, 10); + info = get_irq_info(number); + if (!info) + info = add_misc_irq(number); + count = 0; cpunr = 0; @@ -88,18 +98,13 @@ void parse_proc_interrupts(void) } if (cpunr != core_count) need_cpu_rescan = 1; - - set_interrupt_count(number, count); + + info->last_irq_count = info->irq_count; + info->irq_count = count; /* is interrupt MSI based? 
*/ - int_type = find_irq_integer_prop(number, IRQ_TYPE); - if ((int_type == IRQ_TYPE_MSI) || (int_type == IRQ_TYPE_MSIX)) { + if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX)) msi_found_in_sysfs = 1; - /* Set numa node for irq if it was MSI */ - if (debug_mode) - printf("Set MSI interrupt for %d\n", number); - set_msi_interrupt_numa(number); - } } if ((proc_int_has_msi) && (!msi_found_in_sysfs)) { syslog(LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n");
@@ -113,3 +118,161 @@ void parse_proc_interrupts(void)
 	fclose(file);
 	free(line);
 }
+
+
+/*
+ * for_each_irq callback: adds this irq's count delta (irq_count minus
+ * last_irq_count) to the uint64_t accumulator that data points to
+ */
+static void accumulate_irq_count(struct irq_info *info, void *data)
+{
+	uint64_t *acc = data;
+
+	*acc += (info->irq_count - info->last_irq_count);
+}
+
+/*
+ * for_each_irq callback: sets info->load to this irq's count delta
+ * multiplied by the uint64_t load slice that data points to
+ */
+static void assign_load_slice(struct irq_info *info, void *data)
+{
+	uint64_t *load_slice = data;
+	info->load = (info->irq_count - info->last_irq_count) * *load_slice;
+
+	/*
+	 * Every IRQ has at least a load of 1
+	 */
+	if (!info->load)
+		info->load++;
+}
+
+/*
+ * Recursive helper to estimate the number of irqs shared between
+ * multiple topology objects that was handled by this particular object
+ */
+static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d)
+{
+	uint64_t total_irq_count = 0;
+
+	if (d->parent) {
+		total_irq_count = get_parent_branch_irq_count_share(d->parent);
+		total_irq_count /= g_list_length(*d->obj_type_list);
+	}
+
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_count);
+
+	return total_irq_count;
+}
+
+/*
+ * for_each_object callback: divides the object's load by its number of
+ * children (when it has any), gives every irq on the object a load based on
+ * its count delta, and then adds the object's load to its parent's load
+ */
+static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	uint64_t local_irq_counts = 0;
+	uint64_t load_slice;
+	int load_divisor = g_list_length(d->children);
+
+	d->load /= (load_divisor ? load_divisor : 1);
+
+	if (g_list_length(d->interrupts) > 0) {
+		local_irq_counts = get_parent_branch_irq_count_share(d);
+		load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1;
+		for_each_irq(d->interrupts, assign_load_slice, &load_slice);
+	}
+
+	if (d->parent)
+		d->parent->load += d->load;
+}
+
+/*
+ * Reads the irq and softirq columns of every per-cpu line in /proc/stat.
+ * Once cycle_count is non-zero, each cpu's load becomes the change in that
+ * sum since the previous call; the load is then shared out over the cpus,
+ * cache domains, packages and numa nodes and the irqs attached to them.
+ * A warning is logged and the function returns early when /proc/stat cannot
+ * be read or when load was not collected for every cpu.
+ */
+void parse_proc_stat(void)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	int cpunr, rc, cpucount;
+	struct topo_obj *cpu;
+	int irq_load, softirq_load;
+
+	file = fopen("/proc/stat", "r");
+	if (!file) {
+		syslog(LOG_WARNING, "WARNING cant open /proc/stat. balacing is broken\n");
+		return;
+	}
+
+	/* first line is the header we don't need; nuke it */
+	if (getline(&line, &size, file) <= 0) {
+		free(line);
+		syslog(LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n");
+		fclose(file);
+		return;
+	}
+
+	cpucount = 0;
+	while (!feof(file)) {
+		/* getline() reports EOF or a read error as -1, so test for <= 0 */
+		if (getline(&line, &size, file) <= 0)
+			break;
+
+		if (!strstr(line, "cpu"))
+			break;
+
+		cpunr = strtoul(&line[3], NULL, 10);
+
+		rc = sscanf(line, "%*s %*d %*d %*d %*d %*d %d %d", &irq_load, &softirq_load);
+		if (rc < 2)
+			break;
+
+		cpu = find_cpu_core(cpunr);
+
+		if (!cpu)
+			break;
+
+		cpucount++;
+
+		/*
+		 * For each cpu add the irq and softirq load and propagate that
+		 * all the way up the device tree
+		 */
+		if (cycle_count) {
+			cpu->load = (irq_load + softirq_load) - (cpu->last_load);
+			/*
+			 * the [soft]irq_load values are in jiffies, which are
+			 * units of 10ms, multiply by 1000 to convert that to
+			 * 1/10 milliseconds. This gives us a better integer
+			 * distribution of load between irqs
+			 */
+			cpu->load *= 1000;
+		}
+		cpu->last_load = (irq_load + softirq_load);
+	}
+
+	fclose(file);
+	free(line);	/* buffer was allocated by getline(); release it on every path */
+	if (cpucount != get_cpu_count()) {
+		syslog(LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n");
+		return;
+	}
+
+	/*
+	 * Now that we have load for each cpu attribute a fair share of the load
+	 * to each irq on that cpu
+	 */
+	for_each_object(cpus, compute_irq_branch_load_share, NULL);
+	for_each_object(cache_domains, compute_irq_branch_load_share, NULL);
+	for_each_object(packages, compute_irq_branch_load_share, NULL);
+	for_each_object(numa_nodes, compute_irq_branch_load_share, NULL);
+
+}
diff --git a/types.h b/types.h
index dc166e1..e9a2fcd 100644
--- a/types.h
+++ b/types.h
@@ -26,89 +26,42 @@
 #define IRQ_TYPE_MSI 1
 #define IRQ_TYPE_MSIX 2
 
-
-/*
- * IRQ properties
- */
-enum irq_prop {
-	IRQ_CLASS = 0,
-	IRQ_TYPE,
-	IRQ_NUMA,
-	IRQ_LCPU_MASK,
-	IRQ_MAX_PROPERTY
+/* Kind of topology level a struct topo_obj describes */
+enum obj_type_e {
+	OBJ_TYPE_CPU,
+	OBJ_TYPE_CACHE,
+	OBJ_TYPE_PACKAGE,
+	OBJ_TYPE_NODE
 };
 
-struct package {
-	uint64_t workload;
-	int number;
-
-	cpumask_t mask;
-	int node_num;
-
-	int class_count[7];
-
-	GList *cache_domains;
-	GList *interrupts;
+/* One node of the topology tree, linked to its parent and its children */
+struct topo_obj {
+	uint64_t load;
+	uint64_t last_load;
+	enum obj_type_e obj_type;
+	int number;
+	int powersave_mode;
+	cpumask_t mask;
+	GList *interrupts;
+	struct topo_obj *parent;
+	GList *children;
+	GList **obj_type_list;
 };
 
-struct cache_domain {
-	uint64_t workload;
-	int number;
-
-	int marker;
-	int node_num;
-
-	cpumask_t mask;
-
-	cpumask_t package_mask;
-
-	int class_count[7];
-
-	GList *cpu_cores;
-	GList *interrupts;
+/* Per-irq state: counts, computed load and the object the irq is assigned to */
+struct irq_info {
+	int irq;
+	int class;
+	int type;
+	int level;
+	struct topo_obj *numa_node;
+	cpumask_t cpumask;
+	cpumask_t affinity_hint;
+	uint64_t irq_count;
+	uint64_t last_irq_count;
+	uint64_t load;
+	int moved;
+	struct topo_obj *assigned_obj;
 };
 
-
-struct cpu_core {
-	uint64_t workload;
-	int number;
-
-	int marker;
-	int node_num;
-
-	int class_count[7];
-
-	cpumask_t package_mask;
-	cpumask_t cache_mask;
-	cpumask_t mask;
-
-	GList *interrupts;
-};
-
-struct interrupt {
-	uint64_t workload;
-
-	int balance_level;
-
-	int number;
-	int class;
-	int node_num;
-	int msi;
-
-	uint64_t count;
-	uint64_t old_count;
-	uint64_t extra;
-
-	cpumask_t mask;
-	cpumask_t old_mask;
-
-
-	cpumask_t numa_mask;
-	cpumask_t allowed_mask;
-
-	/* user/driver provided for smarter balancing */
-	cpumask_t node_mask;
-};
-
-
 #endif