Merge branch 'new_balancer'

Conflicts:
	classify.c
Neil Horman 2011-10-12 13:27:10 -04:00
commit 430c88e502
14 changed files with 1081 additions and 1072 deletions


@@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies
EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4
INCLUDES = -I${top_srcdir}
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma -lm
AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE
noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \
types.h


@@ -32,30 +32,40 @@
#include "irqbalance.h"
void activate_mapping(void)
static void activate_mapping(struct irq_info *info, void *data __attribute__((unused)))
{
struct interrupt *irq;
GList *iter;
char buf[PATH_MAX];
FILE *file;
cpumask_t applied_mask;
iter = g_list_first(interrupts);
while (iter) {
irq = iter->data;
iter = g_list_next(iter);
/*
* only activate mappings for irqs that have moved
*/
if (!info->moved)
return;
/* don't set the level if it's a NONE irq, or if there is
* no change */
if (irq->balance_level != BALANCE_NONE &&
!cpus_equal(irq->mask, irq->old_mask)) {
char buf[PATH_MAX];
FILE *file;
sprintf(buf, "/proc/irq/%i/smp_affinity", irq->number);
file = fopen(buf, "w");
if (!file)
continue;
cpumask_scnprintf(buf, PATH_MAX, irq->mask);
fprintf(file,"%s", buf);
fclose(file);
irq->old_mask = irq->mask;
}
}
if (!info->assigned_obj)
return;
sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq);
file = fopen(buf, "w");
if (!file)
return;
if ((hint_policy == HINT_POLICY_EXACT) &&
(!cpus_empty(info->affinity_hint)))
applied_mask = info->affinity_hint;
else
applied_mask = info->assigned_obj->mask;
cpumask_scnprintf(buf, PATH_MAX, applied_mask);
fprintf(file, "%s", buf);
fclose(file);
info->moved = 0; /*migration is done*/
}
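/*
 * A concrete sketch of the effect (assuming irq 30 is assigned to an
 * object whose mask covers cpus 0-1 and no exact hint applies): the
 * code above amounts to
 *
 *	echo 3 > /proc/irq/30/smp_affinity
 *
 * where "3" is the hex cpumask string built by cpumask_scnprintf().
 */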
void activate_mappings(void)
{
for_each_irq(NULL, activate_mapping, NULL);
}


@@ -25,8 +25,6 @@ int map_class_to_level[7] =
{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CACHE, BALANCE_NONE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };
int class_counts[7];
#define MAX_CLASS 0x12
/*
* Class codes lifted from pci spec, appendix D.
@@ -56,35 +54,6 @@ static short class_codes[MAX_CLASS] = {
static GList *interrupts_db;
#define SYSDEV_DIR "/sys/bus/pci/devices"
union property {
int int_val;
cpumask_t mask_val;
};
enum irq_type {
INT_TYPE = 0,
CPUMASK_TYPE,
};
struct irq_property {
enum irq_type itype;
union property iproperty;
};
#define iint_val iproperty.int_val
#define imask_val iproperty.mask_val
struct irq_info {
int irq;
struct irq_property property[IRQ_MAX_PROPERTY];
};
static void init_new_irq(struct irq_info *new)
{
new->property[IRQ_CLASS].itype = INT_TYPE;
new->property[IRQ_TYPE].itype = INT_TYPE;
new->property[IRQ_NUMA].itype = INT_TYPE;
new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE;
}
static gint compare_ints(gconstpointer a, gconstpointer b)
{
@@ -94,11 +63,6 @@ static gint compare_ints(gconstpointer a, gconstpointer b)
return ai->irq - bi->irq;
}
static void free_int(gpointer data)
{
free(data);
}
/*
* Inserts an irq_info struct into the interrupts_db list
* devpath points to the device directory in sysfs for the
@@ -126,13 +90,12 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
return NULL;
}
new = malloc(sizeof(struct irq_info));
new = calloc(sizeof(struct irq_info), 1);
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
new->class = IRQ_OTHER;
interrupts_db = g_list_append(interrupts_db, new);
@@ -159,7 +122,9 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
if (class >= MAX_CLASS)
goto get_numa_node;
new->property[IRQ_CLASS].iint_val = class_codes[class];
new->class = class_codes[class];
new->level = map_class_to_level[class_codes[class]];
get_numa_node:
numa_node = -1;
sprintf(path, "%s/numa_node", devpath);
@@ -171,24 +136,39 @@ get_numa_node:
fclose(fd);
assign_node:
new->property[IRQ_NUMA].iint_val = numa_node;
new->numa_node = get_numa_node(numa_node);
sprintf(path, "%s/local_cpus", devpath);
fd = fopen(path, "r");
if (!fd) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
goto out;
cpus_setall(new->cpumask);
goto assign_affinity_hint;
}
lcpu_mask = NULL;
rc = fscanf(fd, "%as", &lcpu_mask);
fclose(fd);
if (!lcpu_mask) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
if (!lcpu_mask || !rc) {
cpus_setall(new->cpumask);
} else {
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
new->property[IRQ_LCPU_MASK].imask_val);
free(lcpu_mask);
new->cpumask);
}
free(lcpu_mask);
assign_affinity_hint:
cpus_clear(new->affinity_hint);
sprintf(path, "/proc/irq/%d/affinity_hint", irq);
fd = fopen(path, "r");
if (!fd)
goto out;
lcpu_mask = NULL;
rc = fscanf(fd, "%as", &lcpu_mask);
fclose(fd);
if (!lcpu_mask)
goto out;
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
new->affinity_hint);
free(lcpu_mask);
out:
if (debug_mode)
printf("Adding IRQ %d to database\n", irq);
@@ -226,7 +206,7 @@ static void build_one_dev_entry(const char *dirname)
new = add_one_irq_to_db(path, irqnum);
if (!new)
continue;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX;
new->type = IRQ_TYPE_MSIX;
}
} while (entry != NULL);
closedir(msidir);
@@ -248,20 +228,32 @@ static void build_one_dev_entry(const char *dirname)
new = add_one_irq_to_db(path, irqnum);
if (!new)
goto done;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
new->type = IRQ_TYPE_LEGACY;
}
done:
fclose(fd);
return;
}
static void free_irq(struct irq_info *info, void *data __attribute__((unused)))
{
free(info);
}
void free_irq_db(void)
{
for_each_irq(NULL, free_irq, NULL);
g_list_free(interrupts_db);
interrupts_db = NULL;
}
void rebuild_irq_db(void)
{
DIR *devdir = opendir(SYSDEV_DIR);
struct dirent *entry;
g_list_free_full(interrupts_db, free_int);
free_irq_db();
if (!devdir)
return;
@@ -278,83 +270,80 @@ void rebuild_irq_db(void)
closedir(devdir);
}
static GList *add_misc_irq(int irq)
struct irq_info *add_misc_irq(int irq)
{
struct irq_info *new, find;
struct irq_info *new;
new = malloc(sizeof(struct irq_info));
new = calloc(sizeof(struct irq_info), 1);
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
new->property[IRQ_NUMA].iint_val = -1;
new->type = IRQ_TYPE_LEGACY;
new->class = IRQ_OTHER;
new->numa_node = get_numa_node(0);
interrupts_db = g_list_append(interrupts_db, new);
find.irq = irq;
return g_list_find_custom(interrupts_db, &find, compare_ints);
return new;
}
int find_irq_integer_prop(int irq, enum irq_prop prop)
void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data)
{
GList *entry;
struct irq_info find, *result;
find.irq = irq;
GList *entry = g_list_first(list ? list : interrupts_db);
GList *next;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry) {
if (debug_mode)
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
entry = add_misc_irq(irq);
while (entry) {
next = g_list_next(entry);
cb(entry->data, data);
entry = next;
}
result = entry->data;
assert(result->property[prop].itype == INT_TYPE);
return result->property[prop].iint_val;
}
cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop)
struct irq_info *get_irq_info(int irq)
{
GList *entry;
struct irq_info find, *result;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry) {
if (debug_mode)
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
entry = add_misc_irq(irq);
}
result = entry->data;
assert(result->property[prop].itype == CPUMASK_TYPE);
return result->property[prop].imask_val;
}
int get_next_irq(int irq)
{
GList *entry;
struct irq_info *irqp, find;
if (irq == -1) {
entry = g_list_first(interrupts_db);
irqp = entry->data;
return irqp->irq;
}
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry)
return entry ? entry->data : NULL;
}
void migrate_irq(GList **from, GList **to, struct irq_info *info)
{
GList *entry;
struct irq_info find, *tmp;
find.irq = info->irq;
entry = g_list_find_custom(*from, &find, compare_ints);
tmp = entry->data;
*from = g_list_delete_link(*from, entry);
*to = g_list_append(*to, tmp);
info->moved = 1;
}
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
struct irq_info *a, *b;
a = (struct irq_info*)A;
b = (struct irq_info*)B;
if (a->class < b->class)
return 1;
if (a->class > b->class)
return -1;
entry = g_list_next(entry);
if (!entry)
if (a->load < b->load)
return 1;
if (a->load > b->load)
return -1;
irqp= entry->data;
return irqp->irq;
if (a<b)
return 1;
return -1;
}
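/*
 * i.e. a descending sort: higher class first, then higher load within a
 * class; the final pointer comparison just makes the ordering
 * deterministic when both fields are equal.
 */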
void sort_irq_list(GList **list)
{
*list = g_list_sort(*list, sort_irqs);
}


@@ -51,10 +51,11 @@ AC_PROG_AWK
echo .
echo Checking for header files
AC_HEADER_STDC
AC_CHECK_HEADERS(linux/ethtool.h linux/sockios.h, [], [])
AC_CHECK_FUNCS(getopt_long)
AC_CHECK_LIB(numa, numa_run_on_node, [], [])
AC_CHECK_LIB(m, floor, [], [])
AC_C_CONST
AC_C_INLINE

cputree.c

@@ -55,122 +55,103 @@ cpumask_t cpu_possible_map;
*/
static cpumask_t unbanned_cpus;
static int search_numa_node(cpumask_t mask)
{
int node_num, ret;
struct bitmask *node_mask;
cpumask_t cpu_node_mask;
node_num = numa_num_configured_nodes();
if (node_num < 1)
return -1;
node_mask = numa_allocate_cpumask();
node_num--; /* indexing from zero */
while (node_num >= 0) {
ret = numa_node_to_cpus(node_num, node_mask);
if (ret) {
node_num--;
continue;
}
memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long));
if (cpus_intersects(mask, cpu_node_mask)) {
numa_free_cpumask(node_mask);
return node_num;
}
node_num--;
}
numa_free_cpumask(node_mask);
return node_num;
}
static void fill_packages(void)
static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
cpumask_t package_mask)
{
GList *entry;
struct topo_obj *package;
struct topo_obj *lcache;
entry = g_list_first(packages);
entry = g_list_first(cache_domains);
while (entry) {
struct package *package;
struct cache_domain *cache = NULL;
GList *entry2;
cache = entry->data;
entry2 = entry;
entry = g_list_next(entry);
if (cache->marker)
continue;
package = malloc(sizeof(struct package));
if (!package)
package = entry->data;
if (cpus_equal(package_mask, package->mask))
break;
memset(package, 0, sizeof(struct package));
package->mask = cache->package_mask;
package->number = cache->number;
package->node_num = search_numa_node(package->mask);
while (entry2) {
struct cache_domain *cache2;
cache2 = entry2->data;
if (cpus_equal(cache->package_mask, cache2->package_mask)) {
cache2->marker = 1;
package->cache_domains = g_list_append(package->cache_domains, cache2);
if (package->number > cache2->number)
package->number = cache2->number;
}
entry2 = g_list_next(entry2);
}
entry = g_list_next(entry);
}
if (!entry) {
package = calloc(sizeof(struct topo_obj), 1);
if (!package)
return NULL;
package->mask = package_mask;
package->obj_type = OBJ_TYPE_PACKAGE;
package->obj_type_list = &packages;
packages = g_list_append(packages, package);
package_count++;
}
}
static void fill_cache_domain(void)
entry = g_list_first(package->children);
while (entry) {
lcache = entry->data;
if (lcache == cache)
break;
entry = g_list_next(entry);
}
if (!entry) {
package->children = g_list_append(package->children, cache);
cache->parent = package;
}
return package;
}
static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
cpumask_t cache_mask)
{
GList *entry;
struct topo_obj *cache;
struct topo_obj *lcpu;
entry = g_list_first(cache_domains);
entry = g_list_first(cpus);
while (entry) {
struct cache_domain *cache = NULL;
struct cpu_core *cpu;
GList *entry2;
cpu = entry->data;
entry2 = entry;
entry = g_list_next(entry);
if (cpu->marker)
continue;
cache = malloc(sizeof(struct cache_domain));
if (!cache)
cache = entry->data;
if (cpus_equal(cache_mask, cache->mask))
break;
memset(cache, 0, sizeof(struct cache_domain));
cache->mask = cpu->cache_mask;
cache->package_mask = cpu->package_mask;
cache->number = cpu->number;
cache->node_num = search_numa_node(cache->mask);
entry = g_list_next(entry);
}
if (!entry) {
cache = calloc(sizeof(struct topo_obj), 1);
if (!cache)
return NULL;
cache->obj_type = OBJ_TYPE_CACHE;
cache->mask = cache_mask;
cache->number = cache_domain_count;
cache->obj_type_list = &cache_domains;
cache_domains = g_list_append(cache_domains, cache);
cache_domain_count++;
while (entry2) {
struct cpu_core *cpu2;
cpu2 = entry2->data;
if (cpus_equal(cpu->cache_mask, cpu2->cache_mask) &&
cpus_equal(cpu->package_mask, cpu2->package_mask)) {
cpu2->marker = 1;
cache->cpu_cores = g_list_append(cache->cpu_cores, cpu2);
if (cpu2->number < cache->number)
cache->number = cpu2->number;
}
entry2 = g_list_next(entry2);
}
}
entry = g_list_first(cache->children);
while (entry) {
lcpu = entry->data;
if (lcpu == cpu)
break;
entry = g_list_next(entry);
}
if (!entry) {
cache->children = g_list_append(cache->children, cpu);
cpu->parent = (struct topo_obj *)cache;
}
return cache;
}
static void do_one_cpu(char *path)
{
struct cpu_core *cpu;
struct topo_obj *cpu;
FILE *file;
char new_path[PATH_MAX];
cpumask_t cache_mask, package_mask;
struct topo_obj *cache;
struct topo_obj *package;
DIR *dir;
struct dirent *entry;
int nodeid;
/* skip offline cpus */
snprintf(new_path, PATH_MAX, "%s/online", path);
@@ -188,10 +169,11 @@ static void do_one_cpu(char *path)
free(line);
}
cpu = malloc(sizeof(struct cpu_core));
cpu = calloc(sizeof(struct topo_obj), 1);
if (!cpu)
return;
memset(cpu, 0, sizeof(struct cpu_core));
cpu->obj_type = OBJ_TYPE_CPU;
cpu->number = strtoul(&path[27], NULL, 10);
@@ -199,9 +181,6 @@ static void do_one_cpu(char *path)
cpu_set(cpu->number, cpu->mask);
/* set numa node of cpu */
cpu->node_num = search_numa_node(cpu->mask);
/* if the cpu is on the banned list, just don't add it */
if (cpus_intersects(cpu->mask, banned_cpus)) {
free(cpu);
@@ -214,26 +193,26 @@ static void do_one_cpu(char *path)
/* try to read the package mask; if it doesn't exist assume solitary */
snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path);
file = fopen(new_path, "r");
cpu_set(cpu->number, cpu->package_mask);
cpu_set(cpu->number, package_mask);
if (file) {
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->package_mask);
cpumask_parse_user(line, strlen(line), package_mask);
fclose(file);
free(line);
}
/* try to read the cache mask; if it doesn't exist assume solitary */
/* We want the deepest cache level available so try index1 first, then index2 */
cpu_set(cpu->number, cpu->cache_mask);
cpu_set(cpu->number, cache_mask);
snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path);
file = fopen(new_path, "r");
if (file) {
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
cpumask_parse_user(line, strlen(line), cache_mask);
fclose(file);
free(line);
}
@@ -243,66 +222,94 @@ static void do_one_cpu(char *path)
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
cpumask_parse_user(line, strlen(line), cache_mask);
fclose(file);
free(line);
}
nodeid=0;
dir = opendir(path);
do {
entry = readdir(dir);
if (!entry)
break;
if (strstr(entry->d_name, "node")) {
nodeid = strtoul(&entry->d_name[4], NULL, 10);
break;
}
} while (entry);
closedir(dir);
cache = add_cpu_to_cache_domain(cpu, cache_mask);
package = add_cache_domain_to_package(cache, package_mask);
add_package_to_node(package, nodeid);
/*
blank out the banned cpus from the various masks so that interrupts
will never be told to go there
*/
cpus_and(cpu->cache_mask, cpu->cache_mask, unbanned_cpus);
cpus_and(cpu->package_mask, cpu->package_mask, unbanned_cpus);
cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus);
cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus);
cpus_and(cpu->mask, cpu->mask, unbanned_cpus);
cpu->obj_type_list = &cpus;
cpus = g_list_append(cpus, cpu);
core_count++;
}
static void dump_irqs(int spaces, GList *dump_interrupts)
static void dump_irq(struct irq_info *info, void *data)
{
struct interrupt *irq;
while (dump_interrupts) {
int i;
for (i=0; i<spaces; i++) printf(" ");
irq = dump_interrupts->data;
printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload);
dump_interrupts = g_list_next(dump_interrupts);
}
int spaces = (long int)data;
int i;
for (i=0; i<spaces; i++) printf(" ");
printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load);
}
static void dump_topo_obj(struct topo_obj *d, void *data __attribute__((unused)))
{
struct topo_obj *c = (struct topo_obj *)d;
printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load);
if (c->interrupts)
for_each_irq(c->interrupts, dump_irq, (void *)18);
}
static void dump_cache_domain(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4095, d->mask);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_topo_obj, NULL);
if (d->interrupts)
for_each_irq(d->interrupts, dump_irq, (void *)10);
}
static void dump_package(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4096, d->mask);
printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_cache_domain, buffer);
if (d->interrupts)
for_each_irq(d->interrupts, dump_irq, (void *)2);
}
void dump_tree(void)
{
GList *p_iter, *c_iter, *cp_iter;
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
char buffer[4096];
p_iter = g_list_first(packages);
while (p_iter) {
package = p_iter->data;
cpumask_scnprintf(buffer, 4096, package->mask);
printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload);
c_iter = g_list_first(package->cache_domains);
while (c_iter) {
cache_domain = c_iter->data;
c_iter = g_list_next(c_iter);
cpumask_scnprintf(buffer, 4095, cache_domain->mask);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload);
cp_iter = cache_domain->cpu_cores;
while (cp_iter) {
cpu = cp_iter->data;
cp_iter = g_list_next(cp_iter);
printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload);
dump_irqs(18, cpu->interrupts);
}
dump_irqs(10, cache_domain->interrupts);
}
dump_irqs(2, package->interrupts);
p_iter = g_list_next(p_iter);
}
for_each_object(packages, dump_package, buffer);
}
static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
{
info->load = 0;
}
static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused)))
{
for_each_object(d->children, clear_obj_stats, NULL);
for_each_irq(d->interrupts, clear_irq_stats, NULL);
}
/*
@@ -310,40 +317,9 @@ void dump_tree(void)
* which level does how much work and the actual lists of interrupts
* assigned to each component
*/
void clear_work_stats(void)
void clear_work_stats()
{
GList *p_iter, *c_iter, *cp_iter;
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
p_iter = g_list_first(packages);
while (p_iter) {
package = p_iter->data;
package->workload = 0;
g_list_free(package->interrupts);
package->interrupts = NULL;
c_iter = g_list_first(package->cache_domains);
memset(package->class_count, 0, sizeof(package->class_count));
while (c_iter) {
cache_domain = c_iter->data;
c_iter = g_list_next(c_iter);
cache_domain->workload = 0;
cp_iter = cache_domain->cpu_cores;
g_list_free(cache_domain->interrupts);
cache_domain->interrupts = NULL;
memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count));
while (cp_iter) {
cpu = cp_iter->data;
cp_iter = g_list_next(cp_iter);
cpu->workload = 0;
g_list_free(cpu->interrupts);
cpu->interrupts = NULL;
memset(cpu->class_count, 0, sizeof(cpu->class_count));
}
}
p_iter = g_list_next(p_iter);
}
for_each_object(numa_nodes, clear_obj_stats, NULL);
}
@@ -373,9 +349,6 @@ void parse_cpu_tree(void)
} while (entry);
closedir(dir);
fill_cache_domain();
fill_packages();
if (debug_mode)
dump_tree();
@@ -389,14 +362,14 @@ void parse_cpu_tree(void)
void clear_cpu_tree(void)
{
GList *item;
struct cpu_core *cpu;
struct cache_domain *cache_domain;
struct package *package;
struct topo_obj *cpu;
struct topo_obj *cache_domain;
struct topo_obj *package;
while (packages) {
item = g_list_first(packages);
package = item->data;
g_list_free(package->cache_domains);
g_list_free(package->children);
g_list_free(package->interrupts);
free(package);
packages = g_list_delete_link(packages, item);
@@ -406,7 +379,7 @@ void clear_cpu_tree(void)
while (cache_domains) {
item = g_list_first(cache_domains);
cache_domain = item->data;
g_list_free(cache_domain->cpu_cores);
g_list_free(cache_domain->children);
g_list_free(cache_domain->interrupts);
free(cache_domain);
cache_domains = g_list_delete_link(cache_domains, item);
@@ -424,3 +397,28 @@ void clear_cpu_tree(void)
core_count = 0;
}
static gint compare_cpus(gconstpointer a, gconstpointer b)
{
const struct topo_obj *ai = a;
const struct topo_obj *bi = b;
return ai->number - bi->number;
}
struct topo_obj *find_cpu_core(int cpunr)
{
GList *entry;
struct topo_obj find;
find.number = cpunr;
entry = g_list_find_custom(cpus, &find, compare_cpus);
return entry ? entry->data : NULL;
}
int get_cpu_count(void)
{
return g_list_length(cpus);
}


@@ -41,6 +41,19 @@ Causes irqbalance to be run once, after which the daemon exits
.B --debug
Causes irqbalance to run in the foreground and extra debug information to be printed
.TP
.B --hintpolicy=[exact | subset | ignore]
Set the policy for how irq kernel affinity hinting is treated. Can be one of:
.P
.I exact
irq affinity hint is applied unilaterally and never violated
.P
.I subset
irq is balanced, but the assigned object will be a subset of the affinity hint
.P
.I ignore
irq affinity hint value is completely ignored
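.P
For example, to run in the foreground and balance while keeping each irq
within its driver-supplied affinity hint:
.P
irqbalance --debug --hintpolicy=subset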
.SH "ENVIRONMENT VARIABLES"
.TP
.B IRQBALANCE_ONESHOT


@@ -38,13 +38,11 @@
int one_shot_mode;
int debug_mode;
int numa_avail;
int need_cpu_rescan;
extern cpumask_t banned_cpus;
static int counter;
enum hp_e hint_policy = HINT_POLICY_SUBSET;
unsigned long power_thresh = ULONG_MAX;
unsigned long long cycle_count = 0;
void sleep_approx(int seconds)
{
@@ -64,12 +62,15 @@ void sleep_approx(int seconds)
struct option lopts[] = {
{"oneshot", 0, NULL, 'o'},
{"debug", 0, NULL, 'd'},
{"hintpolicy", 1, NULL, 'h'},
{"powerthresh", 1, NULL, 'p'},
{0, 0, 0, 0}
};
static void usage(void)
{
printf("irqbalance [--oneshot | -o] [--debug | -d]");
printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]\n");
printf(" [--powerthresh= | -p <off> | <n>]\n");
}
static void parse_command_line(int argc, char **argv)
@@ -78,7 +79,7 @@ static void parse_command_line(int argc, char **argv)
int longind;
while ((opt = getopt_long(argc, argv,
"",
"odh:p:",
lopts, &longind)) != -1) {
switch(opt) {
@@ -88,6 +89,29 @@ static void parse_command_line(int argc, char **argv)
case 'd':
debug_mode=1;
break;
case 'h':
if (!strncmp(optarg, "exact", strlen(optarg)))
hint_policy = HINT_POLICY_EXACT;
else if (!strncmp(optarg, "subset", strlen(optarg)))
hint_policy = HINT_POLICY_SUBSET;
else if (!strncmp(optarg, "ignore", strlen(optarg)))
hint_policy = HINT_POLICY_IGNORE;
else {
usage();
exit(1);
}
break;
case 'p':
if (!strncmp(optarg, "off", strlen(optarg)))
power_thresh = ULONG_MAX;
else {
power_thresh = strtoull(optarg, NULL, 10);
if (power_thresh == ULONG_MAX) {
usage();
exit(1);
}
}
break;
case 'o':
one_shot_mode=1;
break;
@@ -96,6 +120,50 @@ static void parse_command_line(int argc, char **argv)
}
#endif
/*
* This builds our object tree. The hierarchy is pretty straightforward:
* At the top are numa_nodes
* All CPU packages belong to a single numa_node
* All Cache domains belong to a CPU package
* All CPU cores belong to a cache domain
*
* Objects are built in that order (top down)
*
* Object workload is the aggregate sum of the
* workload of the objects below it
*/
static void build_object_tree()
{
build_numa_node_list();
parse_cpu_tree();
rebuild_irq_db();
}
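/*
 * A minimal sketch of the resulting containment (assuming a single node
 * with one package, one cache domain and two cpus):
 *
 *	numa_nodes -> node0
 *	                `-- package0          (node0->children)
 *	                      `-- cache0      (package0->children)
 *	                            |-- cpu0  (cache0->children)
 *	                            `-- cpu1
 *
 * Each object points back at its container through ->parent, which is
 * what the cpu_package()/cpu_numa_node() style macros in irqbalance.h
 * walk.
 */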
static void free_object_tree()
{
free_numa_node_list();
clear_cpu_tree();
free_irq_db();
}
static void dump_object_tree()
{
for_each_object(numa_nodes, dump_numa_node_info, NULL);
}
static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
{
if (info->level == BALANCE_NONE)
return;
if (info->assigned_obj == NULL)
rebalance_irq_list = g_list_append(rebalance_irq_list, info);
else
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
info->assigned_obj = NULL;
}
int main(int argc, char** argv)
{
@@ -125,9 +193,9 @@ int main(int argc, char** argv)
}
rebuild_irq_db();
parse_cpu_tree();
build_object_tree();
if (debug_mode)
dump_object_tree();
/* On single core UP systems irqbalance obviously has no work to do */
@@ -150,15 +218,10 @@ int main(int argc, char** argv)
capng_apply(CAPNG_SELECT_BOTH);
#endif
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
sleep(SLEEP_INTERVAL/4);
reset_counts();
parse_proc_interrupts();
pci_numa_scan();
calculate_workload();
sort_irq_list();
if (debug_mode)
dump_workloads();
parse_proc_stat();
while (1) {
sleep_approx(SLEEP_INTERVAL);
@@ -166,8 +229,9 @@ int main(int argc, char** argv)
printf("\n\n\n-----------------------------------------------------------------------------\n");
check_power_mode();
clear_work_stats();
parse_proc_interrupts();
parse_proc_stat();
/* cope with cpu hotplug -- detected during /proc/interrupts parsing */
if (need_cpu_rescan) {
@@ -179,25 +243,31 @@ int main(int argc, char** argv)
reset_counts();
clear_work_stats();
clear_cpu_tree();
parse_cpu_tree();
}
free_object_tree();
build_object_tree();
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
parse_proc_stat();
sleep_approx(SLEEP_INTERVAL);
clear_work_stats();
parse_proc_interrupts();
parse_proc_stat();
cycle_count=0;
}
calculate_workload();
/* to cope with dynamic configurations we scan for new numa information
* once every 5 minutes
*/
pci_numa_scan();
if (cycle_count)
update_migration_status();
calculate_placement();
activate_mapping();
activate_mappings();
if (debug_mode)
dump_tree();
if (one_shot_mode)
break;
counter++;
cycle_count++;
}
free_object_tree();
return EXIT_SUCCESS;
}


@@ -8,47 +8,116 @@
#include <stdint.h>
#include <glib.h>
#include <syslog.h>
#include "types.h"
#include <numa.h>
struct interrupt;
extern int package_count;
extern int cache_domain_count;
extern int core_count;
extern char *classes[];
extern int map_class_to_level[7];
extern int class_counts[7];
extern int debug_mode;
extern int power_mode;
extern int need_cpu_rescan;
extern int one_shot_mode;
extern GList *interrupts;
extern void parse_cpu_tree(void);
extern void clear_work_stats(void);
extern void parse_proc_interrupts(void);
extern void rebuild_irq_db(void);
extern void parse_proc_stat(void);
extern void set_interrupt_count(int number, uint64_t count);
extern void set_msi_interrupt_numa(int number);
extern int get_next_irq(int irq);
extern int find_irq_integer_prop(int irq, enum irq_prop prop);
extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop);
extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type);
extern GList *rebalance_irq_list;
void calculate_workload(void);
void update_migration_status(void);
void reset_counts(void);
void dump_workloads(void);
void sort_irq_list(void);
void sort_irq_list(GList **list);
void calculate_placement(void);
void dump_tree(void);
void activate_mapping(void);
void activate_mappings(void);
void account_for_nic_stats(void);
void check_power_mode(void);
void clear_cpu_tree(void);
void pci_numa_scan(void);
/*===================NEW BALANCER FUNCTIONS============================*/
/*
* Master topo_obj type lists
*/
extern GList *numa_nodes;
extern GList *packages;
extern GList *cache_domains;
extern GList *cpus;
enum hp_e {
HINT_POLICY_IGNORE,
HINT_POLICY_SUBSET,
HINT_POLICY_EXACT
};
extern int debug_mode;
extern int one_shot_mode;
extern int power_mode;
extern int need_cpu_rescan;
extern enum hp_e hint_policy;
extern unsigned long long cycle_count;
extern unsigned long power_thresh;
/*
* Numa node access routines
*/
extern void build_numa_node_list(void);
extern void free_numa_node_list(void);
extern void dump_numa_node_info(struct topo_obj *node, void *data);
extern void add_package_to_node(struct topo_obj *p, int nodeid);
extern struct topo_obj *get_numa_node(int nodeid);
/*
* Package functions
*/
#define package_numa_node(p) ((p)->parent)
/*
* cache_domain functions
*/
#define cache_domain_package(c) ((c)->parent)
#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c))))
/*
* cpu core functions
*/
#define cpu_cache_domain(cpu) ((cpu)->parent)
#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu))))
#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu)))))
extern struct topo_obj *find_cpu_core(int cpunr);
extern int get_cpu_count(void);
/*
* irq db functions
*/
extern void rebuild_irq_db(void);
extern void free_irq_db(void);
extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data);
extern struct irq_info *get_irq_info(int irq);
extern void migrate_irq(GList **from, GList **to, struct irq_info *info);
extern struct irq_info *add_misc_irq(int irq);
#define irq_numa_node(irq) ((irq)->numa_node)
/*
* Generic object functions
*/
static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data)
{
GList *entry, *next;
entry = g_list_first(list);
while (entry) {
next = g_list_next(entry);
cb(entry->data, data);
entry = next;
}
}
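/*
 * A minimal usage sketch (count_one is a hypothetical callback with the
 * signature for_each_object() expects):
 *
 *	static void count_one(struct topo_obj *obj __attribute__((unused)),
 *			      void *data)
 *	{
 *		(*(int *)data)++;
 *	}
 *
 *	int n = 0;
 *	for_each_object(cpus, count_one, &n);	// n == number of cpu objects
 */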
#endif

irqlist.c

@@ -29,285 +29,183 @@
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <math.h>
#include "types.h"
#include "irqbalance.h"
GList *interrupts;
struct load_balance_info {
unsigned long long int total_load;
unsigned long long avg_load;
int load_sources;
unsigned long long int deviations;
long double std_deviation;
unsigned int num_within;
unsigned int num_over;
unsigned int num_under;
struct topo_obj *powersave;
};
void get_affinity_hint(struct interrupt *irq, int number)
static void gather_load_stats(struct topo_obj *obj, void *data)
{
char buf[PATH_MAX];
cpumask_t tempmask;
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/affinity_hint", number);
file = fopen(buf, "r");
if (!file)
return;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
return;
}
cpumask_parse_user(line, strlen(line), tempmask);
if (!__cpus_full(&tempmask, num_possible_cpus()))
irq->node_mask = tempmask;
fclose(file);
free(line);
struct load_balance_info *info = data;
info->total_load += obj->load;
info->load_sources += 1;
}
/*
* This function classifies and reads various things from /proc about a specific irq
*/
static void investigate(struct interrupt *irq, int number)
static void compute_deviations(struct topo_obj *obj, void *data)
{
DIR *dir;
struct dirent *entry;
char *c, *c2;
int nr , count = 0, can_set = 1;
char buf[PATH_MAX];
sprintf(buf, "/proc/irq/%i", number);
dir = opendir(buf);
do {
entry = readdir(dir);
if (!entry)
break;
if (strcmp(entry->d_name,"smp_affinity")==0) {
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/smp_affinity", number);
file = fopen(buf, "r+");
if (!file)
continue;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
continue;
}
cpumask_parse_user(line, strlen(line), irq->mask);
/*
* Check that we can write the affinity, if
* not take it out of the list.
*/
fputs(line, file);
if (fclose(file) && errno == EIO)
can_set = 0;
free(line);
} else if (strcmp(entry->d_name,"allowed_affinity")==0) {
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/allowed_affinity", number);
file = fopen(buf, "r");
if (!file)
continue;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
continue;
}
cpumask_parse_user(line, strlen(line), irq->allowed_mask);
fclose(file);
free(line);
} else if (strcmp(entry->d_name,"affinity_hint")==0) {
get_affinity_hint(irq, number);
} else {
irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS);
}
struct load_balance_info *info = data;
unsigned long long int deviation;
} while (entry);
closedir(dir);
irq->balance_level = map_class_to_level[irq->class];
deviation = (obj->load > info->avg_load) ?
obj->load - info->avg_load :
info->avg_load - obj->load;
for (nr = 0; nr < NR_CPUS; nr++)
if (cpu_isset(nr, irq->allowed_mask))
count++;
/* if there is no choice in the allowed mask, don't bother to balance */
if ((count<2) || (can_set == 0))
irq->balance_level = BALANCE_NONE;
/* next, check the IRQBALANCE_BANNED_INTERRUPTS env variable for blacklisted irqs */
c = c2 = getenv("IRQBALANCE_BANNED_INTERRUPTS");
if (!c)
return;
do {
c = c2;
nr = strtoul(c, &c2, 10);
if (c!=c2 && nr == number)
irq->balance_level = BALANCE_NONE;
} while (c!=c2 && c2!=NULL);
info->deviations += (deviation * deviation);
}
/* Set numa node number for MSI interrupt;
* Assumes existing irq metadata
*/
void set_msi_interrupt_numa(int number)
static void move_candidate_irqs(struct irq_info *info, void *data)
{
GList *item;
struct interrupt *irq;
int node;
int *remaining_deviation = (int *)data;
node = find_irq_integer_prop(number, IRQ_NUMA);
if (node < 0)
return;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
if (irq->number == number) {
irq->node_num = node;
irq->msi = 1;
/* never move an irq that has an affinity hint when
* hint_policy is HINT_POLICY_EXACT
*/
if (hint_policy == HINT_POLICY_EXACT)
if (!cpus_empty(info->affinity_hint))
return;
}
item = g_list_next(item);
}
}
/*
* Set the number of interrupts received for a specific irq;
* create the irq metadata if there is none yet
*/
void set_interrupt_count(int number, uint64_t count)
{
GList *item;
struct interrupt *irq;
if (count < MIN_IRQ_COUNT && !one_shot_mode)
return; /* no need to track or set interrupt sources without any activity since boot
but allow for a few (20) boot-time-only interrupts */
item = g_list_first(interrupts);
while (item) {
irq = item->data;
if (irq->number == number) {
irq->count = count;
/* see if affinity_hint changed */
get_affinity_hint(irq, number);
return;
}
item = g_list_next(item);
}
/* new interrupt */
irq = malloc(sizeof(struct interrupt));
if (!irq)
/* Don't rebalance irqs that don't want it */
if (info->level == BALANCE_NONE)
return;
memset(irq, 0, sizeof(struct interrupt));
irq->node_num = -1;
irq->number = number;
irq->count = count;
irq->allowed_mask = CPU_MASK_ALL;
investigate(irq, number);
interrupts = g_list_append(interrupts, irq);
/* Don't move cpus that only have one irq, regardless of load */
if (g_list_length(info->assigned_obj->interrupts) <= 1)
return;
/* Stop rebalancing if we've estimated a full reduction of deviation */
if (*remaining_deviation <= 0)
return;
*remaining_deviation -= info->load;
if (debug_mode)
printf("Selecting irq %d for rebalancing\n", info->irq);
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
info->assigned_obj = NULL;
}
/*
* Set the numa affinity mask for a specific interrupt if there
* is metadata for the interrupt; do nothing if no such data
* exists.
*/
void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type)
static void migrate_overloaded_irqs(struct topo_obj *obj, void *data)
{
GList *item;
struct interrupt *irq;
struct load_balance_info *info = data;
int deviation;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
/*
* Don't rebalance irqs on objects whose load is below the average
*/
if (obj->load <= info->avg_load) {
if ((obj->load + info->std_deviation) <= info->avg_load) {
info->num_under++;
info->powersave = obj;
} else
info->num_within++;
return;
}
if (irq->number == number) {
cpus_or(irq->numa_mask, irq->numa_mask, mask);
irq->node_num = node_num;
if (irq->class < type && irq->balance_level != BALANCE_NONE) {
irq->class = type;
irq->balance_level = map_class_to_level[irq->class];
}
return;
deviation = obj->load - info->avg_load;
if ((deviation > info->std_deviation) &&
(g_list_length(obj->interrupts) > 1)) {
info->num_over++;
/*
* We have a cpu that is overloaded and
* has irqs that can be moved to fix that
*/
/* order the list from least to greatest workload */
sort_irq_list(&obj->interrupts);
/*
* Each irq carries a weighted average amount of load
* we think it's responsible for. Set deviation to the
* difference between this object's load and the average,
* and migrate irqs until we only have one left, or until that
* difference reaches zero
*/
for_each_irq(obj->interrupts, move_candidate_irqs, &deviation);
} else
info->num_within++;
}
static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused)))
{
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
}
static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused)))
{
obj->powersave_mode = 0;
}
#define find_overloaded_objs(name, info) do {\
int ___load_sources;\
memset(&(info), 0, sizeof(struct load_balance_info));\
for_each_object((name), gather_load_stats, &(info));\
(info).avg_load = (info).total_load / (info).load_sources;\
for_each_object((name), compute_deviations, &(info));\
___load_sources = ((info).load_sources == 1) ? 1 : ((info).load_sources - 1);\
(info).std_deviation = (long double)((info).deviations / ___load_sources);\
(info).std_deviation = sqrt((info).std_deviation);\
for_each_object((name), migrate_overloaded_irqs, &(info));\
}while(0)
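/*
 * A small worked example (assuming four cpus with loads 10, 10, 10, 50):
 * total_load = 80, so avg_load = 20; deviations = 3*100 + 900 = 1200,
 * divided by (load_sources - 1) = 3 gives 400, so std_deviation = 20.
 * In migrate_overloaded_irqs() only the cpu at load 50 deviates by more
 * than std_deviation (30 > 20), so only its irqs become migration
 * candidates.
 */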
void update_migration_status(void)
{
struct load_balance_info info;
find_overloaded_objs(cpus, info);
if (cycle_count > 5) {
if (!info.num_over && (info.num_under >= power_thresh)) {
syslog(LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number);
info.powersave->powersave_mode = 1;
for_each_irq(info.powersave->interrupts, force_irq_migration, NULL);
} else if (info.num_over) {
syslog(LOG_INFO, "Load average increasing, re-enabling all cpus for irq balancing\n");
for_each_object(cpus, clear_powersave_mode, NULL);
}
}
find_overloaded_objs(cache_domains, info);
find_overloaded_objs(packages, info);
find_overloaded_objs(numa_nodes, info);
}
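/*
 * e.g. with --powerthresh=2 and cycle_count > 5: once no cpu is
 * overloaded and at least two cpus sit more than one standard deviation
 * below the average load, the last such cpu found is put in
 * powersave_mode and its irqs are forced back onto rebalance_irq_list.
 */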
void calculate_workload(void)
static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused)))
{
int i;
GList *item;
struct interrupt *irq;
for (i=0; i<7; i++)
class_counts[i]=0;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
irq->workload = irq->count - irq->old_count + irq->workload/3 + irq->extra;
class_counts[irq->class]++;
irq->old_count = irq->count;
irq->extra = 0;
}
info->last_irq_count = info->irq_count;
info->irq_count = 0;
}
void reset_counts(void)
{
GList *item;
struct interrupt *irq;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
irq->old_count = irq->count;
irq->extra = 0;
for_each_irq(NULL, reset_irq_count, NULL);
}
}
static void dump_workload(struct irq_info *info, void *unused __attribute__((unused)))
{
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load);
}
void dump_workloads(void)
{
GList *item;
struct interrupt *irq;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload);
}
for_each_irq(NULL, dump_workload, NULL);
}
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
struct interrupt *a, *b;
a = (struct interrupt*)A;
b = (struct interrupt*)B;
if (a->class < b->class)
return 1;
if (a->class > b->class)
return -1;
if (a->workload < b->workload)
return 1;
if (a->workload > b->workload)
return -1;
if (a<b)
return 1;
return -1;
}
void sort_irq_list(void)
{
/* sort by class first (high->low) and then by workload (high->low) */
interrupts = g_list_sort(interrupts, sort_irqs);
}

numa.c

@@ -33,24 +33,130 @@
#include "irqbalance.h"
void pci_numa_scan(void)
#define SYSFS_NODE_PATH "/sys/devices/system/node"
GList *numa_nodes = NULL;
struct topo_obj unspecified_node = {
.load = 0,
.number = -1,
.obj_type = OBJ_TYPE_NODE,
.mask = CPU_MASK_ALL,
.interrupts = NULL,
.children = NULL,
.parent = NULL,
.obj_type_list = &numa_nodes,
};
static void add_one_node(const char *nodename)
{
int irq = -1;
cpumask_t mask;
int node_num;
do {
int type;
irq = get_next_irq(irq);
if (irq == -1)
break;
char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + strlen("/cpumap") + 2);
struct topo_obj *new;
char *cpustr;
FILE *f;
mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK);
node_num = find_irq_integer_prop(irq, IRQ_NUMA);
type = find_irq_integer_prop(irq, IRQ_CLASS);
add_interrupt_numa(irq, mask, node_num, type);
} while (irq != -1);
if (!path)
return;
new = calloc(1, sizeof(struct topo_obj));
if (!new)
return;
sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename);
f = fopen(path, "r");
if (!f) {
cpus_clear(new->mask);
} else {
cpustr = NULL;
fscanf(f, "%as", &cpustr);
if (!cpustr) {
cpus_clear(new->mask);
} else {
cpumask_parse_user(cpustr, strlen(cpustr), new->mask);
free(cpustr);
}
fclose(f);
}
new->obj_type = OBJ_TYPE_NODE;
new->number = strtoul(&nodename[4], NULL, 10);
new->obj_type_list = &numa_nodes;
numa_nodes = g_list_append(numa_nodes, new);
}
void build_numa_node_list(void)
{
DIR *dir = opendir(SYSFS_NODE_PATH);
struct dirent *entry;
if (!dir)
return;
do {
entry = readdir(dir);
if (!entry)
break;
if ((entry->d_type == DT_DIR) && (strstr(entry->d_name, "node"))) {
add_one_node(entry->d_name);
}
} while (entry);
closedir(dir);
}
static void free_numa_node(gpointer data)
{
free(data);
}
void free_numa_node_list(void)
{
g_list_free_full(numa_nodes, free_numa_node);
numa_nodes = NULL;
}
static gint compare_node(gconstpointer a, gconstpointer b)
{
const struct topo_obj *ai = a;
const struct topo_obj *bi = b;
return (ai->number == bi->number) ? 0 : 1;
}
void add_package_to_node(struct topo_obj *p, int nodeid)
{
struct topo_obj find, *node;
find.number = nodeid;
GList *entry;
find.number = nodeid;
entry = g_list_find_custom(numa_nodes, &find, compare_node);
if (!entry) {
if (debug_mode)
printf("Could not find numa node for node id %d\n", nodeid);
return;
}
node = entry->data;
if (!p->parent) {
node->children = g_list_append(node->children, p);
p->parent = node;
}
}
void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused)))
{
char buffer[4096];
printf("NUMA NODE NUMBER: %d\n", d->number);
cpumask_scnprintf(buffer, 4096, d->mask);
printf("LOCAL CPU MASK: %s\n", buffer);
printf("\n");
}
struct topo_obj *get_numa_node(int nodeid)
{
struct topo_obj find;
GList *entry;
if (nodeid == -1)
return &unspecified_node;
find.number = nodeid;
entry = g_list_find_custom(numa_nodes, &find, compare_node);
return entry ? entry->data : NULL;
}


@@ -30,355 +30,167 @@
int power_mode;
extern GList *interrupts, *packages, *cache_domains, *cpus;
GList *rebalance_irq_list;
static uint64_t package_cost_func(struct interrupt *irq, struct package *package)
struct obj_placement {
struct topo_obj *best;
struct topo_obj *least_irqs;
uint64_t best_cost;
struct irq_info *info;
};
static void find_best_object(struct topo_obj *d, void *data)
{
int bonus = 0;
int maxcount;
int dist;
/* moving to a cold package/cache/etc gets you a 3000 penalty */
if (!cpus_intersects(irq->old_mask, package->mask))
bonus = CROSS_PACKAGE_PENALTY;
struct obj_placement *best = (struct obj_placement *)data;
uint64_t newload;
cpumask_t subset;
/* do a little numa affinity */
if (irq->node_num != package->node_num) {
if (irq->node_num >= 0 && package->node_num >= 0) {
dist = numa_distance(irq->node_num, package->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
/*
* If the hint policy is subset, then we only want
* to consider objects that are within the irq's hint, but
* only if that irq in fact has published a hint
*/
if (hint_policy == HINT_POLICY_SUBSET) {
if (!cpus_empty(best->info->affinity_hint)) {
cpus_and(subset, best->info->affinity_hint, d->mask);
if (cpus_empty(subset))
return;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
if (d->powersave_mode)
return;
/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
if (power_mode)
bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number;
/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
if (package->class_count[irq->class]>=maxcount && !power_mode)
bonus += 300000;
/* if the package has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, package->mask))
bonus += 600000;
return irq->workload + bonus;
}
static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
{
int bonus = 0;
int dist;
/* moving to a cold cache gets you a 1500 penalty */
if (!cpus_intersects(irq->old_mask, cache_domain->mask))
bonus = CROSS_PACKAGE_PENALTY/2;
/* do a little numa affinity */
if (irq->node_num != cache_domain->node_num) {
if (irq->node_num >= 0 && cache_domain->node_num >= 0) {
dist = numa_distance(irq->node_num, cache_domain->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
newload = d->load;
if (newload < best->best_cost) {
best->best = d;
best->best_cost = newload;
best->least_irqs = NULL;
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
/* try to avoid having a lot of MSI interrupts (globally, not by device id) on
* the cache domain */
if (irq->msi == 1)
bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
/* if the cache domain has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
bonus += 600000;
return irq->workload + bonus;
}
static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
{
int bonus = 0;
int dist;
/* moving to a colder core gets you a 1000 penalty */
if (!cpus_intersects(irq->old_mask, cpu->mask))
bonus = CROSS_PACKAGE_PENALTY/3;
/* do a little numa affinity */
if (irq->node_num != cpu->node_num) {
if (irq->node_num >= 0 && cpu->node_num >= 0) {
dist = numa_distance(irq->node_num, cpu->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/*
* since some chipsets only place at the first cpu, give a tiny preference to non-first
* cpus for specifically placed interrupts
*/
if (first_cpu(cpu->cache_mask)==cpu->number)
bonus++;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
/* if the core has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, cpu->mask))
bonus += 600000;
return irq->workload + bonus;
}
static void place_cache_domain(struct package *package)
{
GList *iter, *next;
GList *pkg;
struct interrupt *irq;
struct cache_domain *cache_domain;
iter = g_list_first(package->interrupts);
while (iter) {
struct cache_domain *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
if (irq->balance_level <= BALANCE_PACKAGE) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(package->cache_domains);
while (pkg) {
uint64_t newload;
cache_domain = pkg->data;
newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain);
if (newload < best_cost) {
best = cache_domain;
best_cost = newload;
}
pkg = g_list_next(pkg);
}
if (best) {
next = g_list_next(iter);
package->interrupts = g_list_delete_link(package->interrupts, iter);
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
iter = next;
} else
iter = g_list_next(iter);
if (newload == best->best_cost) {
if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
best->least_irqs = d;
}
}
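/*
 * e.g. three candidate children with loads 5, 5 and 9: the two load-5
 * objects tie on best_cost, so least_irqs tracks whichever of them
 * currently holds fewer irqs, and the load-9 object is never chosen.
 */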
static void place_core(struct cache_domain *cache_domain)
static void find_best_object_for_irq(struct irq_info *info, void *data)
{
GList *iter, *next;
GList *pkg;
struct interrupt *irq;
struct cpu_core *cpu;
struct obj_placement place;
struct topo_obj *d = data;
struct topo_obj *asign;
if (!info->moved)
return;
iter = g_list_first(cache_domain->interrupts);
while (iter) {
struct cpu_core *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
switch (d->obj_type) {
case OBJ_TYPE_NODE:
if (info->level == BALANCE_NONE)
return;
break;
/* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */
if (irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(cache_domain->cpu_cores);
while (pkg) {
uint64_t newload;
case OBJ_TYPE_PACKAGE:
if (info->level == BALANCE_PACKAGE)
return;
break;
cpu = pkg->data;
newload = cpu->workload + cpu_cost_func(irq, cpu);
if (newload < best_cost) {
best = cpu;
best_cost = newload;
}
case OBJ_TYPE_CACHE:
if (info->level == BALANCE_CACHE)
return;
break;
pkg = g_list_next(pkg);
}
if (best) {
next = g_list_next(iter);
cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter);
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
iter = next;
} else
iter = g_list_next(iter);
case OBJ_TYPE_CPU:
if (info->level == BALANCE_CORE)
return;
break;
}
place.info = info;
place.best = NULL;
place.least_irqs = NULL;
place.best_cost = INT_MAX;
for_each_object(d->children, find_best_object, &place);
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&d->interrupts, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
static void place_packages(GList *list)
static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused)))
{
GList *iter;
GList *pkg;
struct interrupt *irq;
struct package *package;
if (g_list_length(d->interrupts) > 0)
for_each_irq(d->interrupts, find_best_object_for_irq, d);
}
static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
{
struct obj_placement place;
struct topo_obj *asign;
iter = g_list_first(list);
while (iter) {
struct package *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
if (irq->balance_level == BALANCE_NONE) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(packages);
while (pkg) {
uint64_t newload;
if (info->level == BALANCE_NONE)
return;
package = pkg->data;
newload = package->workload + package_cost_func(irq, package);
if (newload < best_cost) {
best = package;
best_cost = newload;
}
if (irq_numa_node(info)->number != -1) {
/*
* This irq belongs to a device with a preferred numa node
* put it on that node
*/
migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info);
info->assigned_obj = irq_numa_node(info);
irq_numa_node(info)->load += info->load + 1;
return;
}
pkg = g_list_next(pkg);
}
if (best) {
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
}
iter = g_list_next(iter);
place.best_cost = INT_MAX;
place.best = NULL;
place.least_irqs = NULL;
place.info = info;
for_each_object(numa_nodes, find_best_object, &place);
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
static void place_affinity_hint(GList *list)
static void validate_irq(struct irq_info *info, void *data)
{
/* still need to balance best workload within the affinity_hint mask */
GList *iter;
struct interrupt *irq;
iter = g_list_first(list);
while (iter) {
irq = iter->data;
if (irq->balance_level == BALANCE_NONE) {
iter = g_list_next(iter);
continue;
}
if ((!cpus_empty(irq->node_mask)) &&
(!cpus_equal(irq->mask, irq->node_mask)) &&
(!__cpus_full(&irq->node_mask, num_possible_cpus()))) {
irq->old_mask = irq->mask;
irq->mask = irq->node_mask;
}
iter = g_list_next(iter);
}
if (info->assigned_obj != data)
printf("object validation error: irq %d is wrong, points to %p, should be %p\n",
info->irq, info->assigned_obj, data);
}
static void do_unroutables(void)
static void validate_object(struct topo_obj *d, void *data __attribute__((unused)))
{
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
struct interrupt *irq;
GList *iter, *inter;
inter = g_list_first(interrupts);
while (inter) {
irq = inter->data;
inter = g_list_next(inter);
if (irq->balance_level != BALANCE_NONE)
continue;
iter = g_list_first(packages);
while (iter) {
package = iter->data;
if (cpus_intersects(package->mask, irq->node_mask) ||
cpus_intersects(package->mask, irq->mask))
package->workload += irq->workload;
iter = g_list_next(iter);
}
iter = g_list_first(cache_domains);
while (iter) {
cache_domain = iter->data;
if (cpus_intersects(cache_domain->mask, irq->node_mask)
|| cpus_intersects(cache_domain->mask, irq->mask))
cache_domain->workload += irq->workload;
iter = g_list_next(iter);
}
iter = g_list_first(cpus);
while (iter) {
cpu = iter->data;
if (cpus_intersects(cpu->mask, irq->node_mask) ||
cpus_intersects(cpu->mask, irq->mask))
cpu->workload += irq->workload;
iter = g_list_next(iter);
}
}
if (d->interrupts)
for_each_irq(d->interrupts, validate_irq, d);
}
static void validate_object_tree_placement()
{
for_each_object(packages, validate_object, NULL);
for_each_object(cache_domains, validate_object, NULL);
for_each_object(cpus, validate_object, NULL);
}
void calculate_placement(void)
{
struct package *package;
struct cache_domain *cache_domain;
GList *iter;
/* first clear old data */
clear_work_stats();
sort_irq_list();
do_unroutables();
place_packages(interrupts);
iter = g_list_first(packages);
while (iter) {
package = iter->data;
place_cache_domain(package);
iter = g_list_next(iter);
sort_irq_list(&rebalance_irq_list);
if (g_list_length(rebalance_irq_list) > 0) {
for_each_irq(rebalance_irq_list, place_irq_in_node, NULL);
for_each_object(numa_nodes, place_irq_in_object, NULL);
for_each_object(packages, place_irq_in_object, NULL);
for_each_object(cache_domains, place_irq_in_object, NULL);
}
iter = g_list_first(cache_domains);
while (iter) {
cache_domain = iter->data;
place_core(cache_domain);
iter = g_list_next(iter);
}
/*
* if affinity_hint is populated on irq and is not set to
* all CPUs (meaning it's initialized), honor that above
* anything in the package locality/workload.
*/
place_affinity_hint(interrupts);
if (debug_mode)
validate_object_tree_placement();
}


@@ -28,54 +28,7 @@
#include "irqbalance.h"
extern int power_mode;
static uint64_t previous;
static unsigned int hysteresis;
void check_power_mode(void)
{
FILE *file;
char *line = NULL;
size_t size = 0;
char *c;
uint64_t dummy __attribute__((unused));
uint64_t irq, softirq;
file = fopen("/proc/stat", "r");
if (!file)
return;
if (getline(&line, &size, file)==0)
size=0;
fclose(file);
if (!line)
return;
c=&line[4];
dummy = strtoull(c, &c, 10); /* user */
dummy = strtoull(c, &c, 10); /* nice */
dummy = strtoull(c, &c, 10); /* system */
dummy = strtoull(c, &c, 10); /* idle */
dummy = strtoull(c, &c, 10); /* iowait */
irq = strtoull(c, &c, 10); /* irq */
softirq = strtoull(c, &c, 10); /* softirq */
irq += softirq;
printf("IRQ delta is %lu \n", (unsigned long)(irq - previous) );
if (irq - previous < POWER_MODE_SOFTIRQ_THRESHOLD) {
hysteresis++;
if (hysteresis > POWER_MODE_HYSTERESIS) {
if (debug_mode && !power_mode)
printf("IRQ delta is %lu, switching to power mode \n", (unsigned long)(irq - previous) );
power_mode = 1;
}
} else {
if (debug_mode && power_mode)
printf("IRQ delta is %lu, switching to performance mode \n", (unsigned long)(irq - previous) );
power_mode = 0;
hysteresis = 0;
}
previous = irq;
free(line);
}


@@ -25,6 +25,7 @@
#include <stdint.h>
#include <string.h>
#include <syslog.h>
#include <ctype.h>
#include "cpumask.h"
#include "irqbalance.h"
@@ -39,7 +40,6 @@ void parse_proc_interrupts(void)
FILE *file;
char *line = NULL;
size_t size = 0;
int int_type;
file = fopen("/proc/interrupts", "r");
if (!file)
@@ -48,6 +48,7 @@ void parse_proc_interrupts(void)
/* first line is the header we don't need; nuke it */
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
return;
}
@@ -56,6 +57,7 @@ void parse_proc_interrupts(void)
int number;
uint64_t count;
char *c, *c2;
struct irq_info *info;
if (getline(&line, &size, file)==0)
break;
@@ -65,7 +67,11 @@ void parse_proc_interrupts(void)
proc_int_has_msi = 1;
/* lines with letters in front are special, like NMI count. Ignore */
if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9')))
c = line;
while (isblank(*(c)))
c++;
if (!(*c>='0' && *c<='9'))
break;
c = strchr(line, ':');
if (!c)
@@ -73,6 +79,10 @@
*c = 0;
c++;
number = strtoul(line, NULL, 10);
info = get_irq_info(number);
if (!info)
info = add_misc_irq(number);
count = 0;
cpunr = 0;
@@ -88,18 +98,13 @@
}
if (cpunr != core_count)
need_cpu_rescan = 1;
set_interrupt_count(number, count);
info->last_irq_count = info->irq_count;
info->irq_count = count;
/* is interrupt MSI based? */
int_type = find_irq_integer_prop(number, IRQ_TYPE);
if ((int_type == IRQ_TYPE_MSI) || (int_type == IRQ_TYPE_MSIX)) {
if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX))
msi_found_in_sysfs = 1;
/* Set numa node for irq if it was MSI */
if (debug_mode)
printf("Set MSI interrupt for %d\n", number);
set_msi_interrupt_numa(number);
}
}
if ((proc_int_has_msi) && (!msi_found_in_sysfs)) {
syslog(LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n");
@@ -113,3 +118,138 @@ void parse_proc_interrupts(void)
fclose(file);
free(line);
}
static void accumulate_irq_count(struct irq_info *info, void *data)
{
uint64_t *acc = data;
*acc += (info->irq_count - info->last_irq_count);
}
static void assign_load_slice(struct irq_info *info, void *data)
{
uint64_t *load_slice = data;
info->load = (info->irq_count - info->last_irq_count) * *load_slice;
/*
* Every IRQ has at least a load of 1
*/
if (!info->load)
info->load++;
}
/*
* Recursive helper to estimate the share of interrupt counts on
* ancestor topology objects attributable to this particular object
*/
static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d)
{
uint64_t total_irq_count = 0;
if (d->parent) {
total_irq_count = get_parent_branch_irq_count_share(d->parent);
total_irq_count /= g_list_length(*d->obj_type_list);
}
if (g_list_length(d->interrupts) > 0)
for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_count);
return total_irq_count;
}
static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused)))
{
uint64_t local_irq_counts = 0;
uint64_t load_slice;
int load_divisor = g_list_length(d->children);
d->load /= (load_divisor ? load_divisor : 1);
if (g_list_length(d->interrupts) > 0) {
local_irq_counts = get_parent_branch_irq_count_share(d);
load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1;
for_each_irq(d->interrupts, assign_load_slice, &load_slice);
}
if (d->parent)
d->parent->load += d->load;
}
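/*
 * A small worked example (assuming a cpu whose ancestors have no irqs
 * attached directly, a cpu load of 2000 after scaling, and two local
 * irqs whose counts grew by 300 and 100 this cycle): local_irq_counts
 * is 400, load_slice = 2000 / 400 = 5, and the two irqs are assigned
 * loads of 1500 and 500, proportional to the interrupts they raised.
 */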
void parse_proc_stat()
{
FILE *file;
char *line = NULL;
size_t size = 0;
int cpunr, rc, cpucount;
struct topo_obj *cpu;
int irq_load, softirq_load;
file = fopen("/proc/stat", "r");
if (!file) {
syslog(LOG_WARNING, "WARNING cant open /proc/stat. balacing is broken\n");
return;
}
/* first line is the header we don't need; nuke it */
if (getline(&line, &size, file)==0) {
free(line);
syslog(LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n");
fclose(file);
return;
}
cpucount = 0;
while (!feof(file)) {
if (getline(&line, &size, file)==0)
break;
if (!strstr(line, "cpu"))
break;
cpunr = strtoul(&line[3], NULL, 10);
rc = sscanf(line, "%*s %*d %*d %*d %*d %*d %d %d", &irq_load, &softirq_load);
if (rc < 2)
break;
cpu = find_cpu_core(cpunr);
if (!cpu)
break;
cpucount++;
/*
* For each cpu add the irq and softirq load and propagate that
* all the way up the device tree
*/
if (cycle_count) {
cpu->load = (irq_load + softirq_load) - (cpu->last_load);
/*
* the [soft]irq_load values are in jiffies, which are
* units of 10ms (assuming HZ=100); multiply by 1000 to
* express that in units of 1/100 ms. This gives us a better
* integer distribution of load between irqs
*/
cpu->load *= 1000;
}
cpu->last_load = (irq_load + softirq_load);
}
fclose(file);
if (cpucount != get_cpu_count()) {
syslog(LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n");
return;
}
/*
* Now that we have load for each cpu attribute a fair share of the load
* to each irq on that cpu
*/
for_each_object(cpus, compute_irq_branch_load_share, NULL);
for_each_object(cache_domains, compute_irq_branch_load_share, NULL);
for_each_object(packages, compute_irq_branch_load_share, NULL);
for_each_object(numa_nodes, compute_irq_branch_load_share, NULL);
}

types.h

@@ -26,89 +26,39 @@
#define IRQ_TYPE_MSI 1
#define IRQ_TYPE_MSIX 2
/*
* IRQ properties
*/
enum irq_prop {
IRQ_CLASS = 0,
IRQ_TYPE,
IRQ_NUMA,
IRQ_LCPU_MASK,
IRQ_MAX_PROPERTY
enum obj_type_e {
OBJ_TYPE_CPU,
OBJ_TYPE_CACHE,
OBJ_TYPE_PACKAGE,
OBJ_TYPE_NODE
};
struct package {
uint64_t workload;
int number;
cpumask_t mask;
int node_num;
int class_count[7];
GList *cache_domains;
GList *interrupts;
struct topo_obj {
uint64_t load;
uint64_t last_load;
enum obj_type_e obj_type;
int number;
int powersave_mode;
cpumask_t mask;
GList *interrupts;
struct topo_obj *parent;
GList *children;
GList **obj_type_list;
};
struct cache_domain {
uint64_t workload;
int number;
int marker;
int node_num;
cpumask_t mask;
cpumask_t package_mask;
int class_count[7];
GList *cpu_cores;
GList *interrupts;
struct irq_info {
int irq;
int class;
int type;
int level;
struct topo_obj *numa_node;
cpumask_t cpumask;
cpumask_t affinity_hint;
uint64_t irq_count;
uint64_t last_irq_count;
uint64_t load;
int moved;
struct topo_obj *assigned_obj;
};
struct cpu_core {
uint64_t workload;
int number;
int marker;
int node_num;
int class_count[7];
cpumask_t package_mask;
cpumask_t cache_mask;
cpumask_t mask;
GList *interrupts;
};
struct interrupt {
uint64_t workload;
int balance_level;
int number;
int class;
int node_num;
int msi;
uint64_t count;
uint64_t old_count;
uint64_t extra;
cpumask_t mask;
cpumask_t old_mask;
cpumask_t numa_mask;
cpumask_t allowed_mask;
/* user/driver provided for smarter balancing */
cpumask_t node_mask;
};
#endif