Merge branch 'new_balancer'
Conflicts: classify.c
This commit is contained in:
commit
430c88e502
|
@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies
|
|||
EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4
|
||||
|
||||
INCLUDES = -I${top_srcdir}
|
||||
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma
|
||||
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma -lm
|
||||
AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE
|
||||
noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \
|
||||
types.h
|
||||
|
|
50
activate.c
50
activate.c
|
@ -32,30 +32,40 @@
|
|||
#include "irqbalance.h"
|
||||
|
||||
|
||||
void activate_mapping(void)
|
||||
static void activate_mapping(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
struct interrupt *irq;
|
||||
GList *iter;
|
||||
|
||||
iter = g_list_first(interrupts);
|
||||
while (iter) {
|
||||
irq = iter->data;
|
||||
iter = g_list_next(iter);
|
||||
|
||||
/* don't set the level if it's a NONE irq, or if there is
|
||||
* no change */
|
||||
if (irq->balance_level != BALANCE_NONE &&
|
||||
!cpus_equal(irq->mask, irq->old_mask)) {
|
||||
char buf[PATH_MAX];
|
||||
FILE *file;
|
||||
sprintf(buf, "/proc/irq/%i/smp_affinity", irq->number);
|
||||
cpumask_t applied_mask;
|
||||
|
||||
/*
|
||||
* only activate mappings for irqs that have moved
|
||||
*/
|
||||
if (!info->moved)
|
||||
return;
|
||||
|
||||
if (!info->assigned_obj)
|
||||
return;
|
||||
|
||||
|
||||
sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq);
|
||||
file = fopen(buf, "w");
|
||||
if (!file)
|
||||
continue;
|
||||
cpumask_scnprintf(buf, PATH_MAX, irq->mask);
|
||||
fprintf(file,"%s", buf);
|
||||
return;
|
||||
|
||||
if ((hint_policy == HINT_POLICY_EXACT) &&
|
||||
(!cpus_empty(info->affinity_hint)))
|
||||
applied_mask = info->affinity_hint;
|
||||
else
|
||||
applied_mask = info->assigned_obj->mask;
|
||||
|
||||
cpumask_scnprintf(buf, PATH_MAX, applied_mask);
|
||||
fprintf(file, "%s", buf);
|
||||
fclose(file);
|
||||
irq->old_mask = irq->mask;
|
||||
}
|
||||
}
|
||||
info->moved = 0; /*migration is done*/
|
||||
}
|
||||
|
||||
void activate_mappings(void)
|
||||
{
|
||||
for_each_irq(NULL, activate_mapping, NULL);
|
||||
}
|
||||
|
|
205
classify.c
205
classify.c
|
@ -25,8 +25,6 @@ int map_class_to_level[7] =
|
|||
{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CACHE, BALANCE_NONE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };
|
||||
|
||||
|
||||
int class_counts[7];
|
||||
|
||||
#define MAX_CLASS 0x12
|
||||
/*
|
||||
* Class codes lifted from pci spec, appendix D.
|
||||
|
@ -56,35 +54,6 @@ static short class_codes[MAX_CLASS] = {
|
|||
static GList *interrupts_db;
|
||||
|
||||
#define SYSDEV_DIR "/sys/bus/pci/devices"
|
||||
union property {
|
||||
int int_val;
|
||||
cpumask_t mask_val;
|
||||
};
|
||||
|
||||
enum irq_type {
|
||||
INT_TYPE = 0,
|
||||
CPUMASK_TYPE,
|
||||
};
|
||||
|
||||
struct irq_property {
|
||||
enum irq_type itype;
|
||||
union property iproperty;
|
||||
};
|
||||
#define iint_val iproperty.int_val
|
||||
#define imask_val iproperty.mask_val
|
||||
|
||||
struct irq_info {
|
||||
int irq;
|
||||
struct irq_property property[IRQ_MAX_PROPERTY];
|
||||
};
|
||||
|
||||
static void init_new_irq(struct irq_info *new)
|
||||
{
|
||||
new->property[IRQ_CLASS].itype = INT_TYPE;
|
||||
new->property[IRQ_TYPE].itype = INT_TYPE;
|
||||
new->property[IRQ_NUMA].itype = INT_TYPE;
|
||||
new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE;
|
||||
}
|
||||
|
||||
static gint compare_ints(gconstpointer a, gconstpointer b)
|
||||
{
|
||||
|
@ -94,11 +63,6 @@ static gint compare_ints(gconstpointer a, gconstpointer b)
|
|||
return ai->irq - bi->irq;
|
||||
}
|
||||
|
||||
static void free_int(gpointer data)
|
||||
{
|
||||
free(data);
|
||||
}
|
||||
|
||||
/*
|
||||
* Inserts an irq_info struct into the interrupts_db list
|
||||
* devpath points to the device directory in sysfs for the
|
||||
|
@ -126,13 +90,12 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
new = malloc(sizeof(struct irq_info));
|
||||
new = calloc(sizeof(struct irq_info), 1);
|
||||
if (!new)
|
||||
return NULL;
|
||||
init_new_irq(new);
|
||||
|
||||
new->irq = irq;
|
||||
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
|
||||
new->class = IRQ_OTHER;
|
||||
|
||||
interrupts_db = g_list_append(interrupts_db, new);
|
||||
|
||||
|
@ -159,7 +122,9 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
|
|||
if (class >= MAX_CLASS)
|
||||
goto get_numa_node;
|
||||
|
||||
new->property[IRQ_CLASS].iint_val = class_codes[class];
|
||||
new->class = class_codes[class];
|
||||
new->level = map_class_to_level[class_codes[class]];
|
||||
|
||||
get_numa_node:
|
||||
numa_node = -1;
|
||||
sprintf(path, "%s/numa_node", devpath);
|
||||
|
@ -171,24 +136,39 @@ get_numa_node:
|
|||
fclose(fd);
|
||||
|
||||
assign_node:
|
||||
new->property[IRQ_NUMA].iint_val = numa_node;
|
||||
new->numa_node = get_numa_node(numa_node);
|
||||
|
||||
sprintf(path, "%s/local_cpus", devpath);
|
||||
fd = fopen(path, "r");
|
||||
if (!fd) {
|
||||
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
|
||||
goto out;
|
||||
cpus_setall(new->cpumask);
|
||||
goto assign_affinity_hint;
|
||||
}
|
||||
lcpu_mask = NULL;
|
||||
rc = fscanf(fd, "%as", &lcpu_mask);
|
||||
fclose(fd);
|
||||
if (!lcpu_mask) {
|
||||
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
|
||||
if (!lcpu_mask || !rc) {
|
||||
cpus_setall(new->cpumask);
|
||||
} else {
|
||||
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
|
||||
new->property[IRQ_LCPU_MASK].imask_val);
|
||||
free(lcpu_mask);
|
||||
new->cpumask);
|
||||
}
|
||||
free(lcpu_mask);
|
||||
|
||||
assign_affinity_hint:
|
||||
cpus_clear(new->affinity_hint);
|
||||
sprintf(path, "/proc/irq/%d/affinity_hint", irq);
|
||||
fd = fopen(path, "r");
|
||||
if (!fd)
|
||||
goto out;
|
||||
lcpu_mask = NULL;
|
||||
rc = fscanf(fd, "%as", &lcpu_mask);
|
||||
fclose(fd);
|
||||
if (!lcpu_mask)
|
||||
goto out;
|
||||
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
|
||||
new->affinity_hint);
|
||||
free(lcpu_mask);
|
||||
out:
|
||||
if (debug_mode)
|
||||
printf("Adding IRQ %d to database\n", irq);
|
||||
|
@ -226,7 +206,7 @@ static void build_one_dev_entry(const char *dirname)
|
|||
new = add_one_irq_to_db(path, irqnum);
|
||||
if (!new)
|
||||
continue;
|
||||
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX;
|
||||
new->type = IRQ_TYPE_MSIX;
|
||||
}
|
||||
} while (entry != NULL);
|
||||
closedir(msidir);
|
||||
|
@ -248,20 +228,32 @@ static void build_one_dev_entry(const char *dirname)
|
|||
new = add_one_irq_to_db(path, irqnum);
|
||||
if (!new)
|
||||
goto done;
|
||||
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
|
||||
new->type = IRQ_TYPE_LEGACY;
|
||||
}
|
||||
|
||||
done:
|
||||
fclose(fd);
|
||||
return;
|
||||
}
|
||||
|
||||
static void free_irq(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
free(info);
|
||||
}
|
||||
|
||||
void free_irq_db(void)
|
||||
{
|
||||
for_each_irq(NULL, free_irq, NULL);
|
||||
g_list_free(interrupts_db);
|
||||
interrupts_db = NULL;
|
||||
}
|
||||
|
||||
void rebuild_irq_db(void)
|
||||
{
|
||||
DIR *devdir = opendir(SYSDEV_DIR);
|
||||
struct dirent *entry;
|
||||
|
||||
g_list_free_full(interrupts_db, free_int);
|
||||
free_irq_db();
|
||||
|
||||
if (!devdir)
|
||||
return;
|
||||
|
@ -278,83 +270,80 @@ void rebuild_irq_db(void)
|
|||
closedir(devdir);
|
||||
}
|
||||
|
||||
static GList *add_misc_irq(int irq)
|
||||
struct irq_info *add_misc_irq(int irq)
|
||||
{
|
||||
struct irq_info *new, find;
|
||||
struct irq_info *new;
|
||||
|
||||
new = malloc(sizeof(struct irq_info));
|
||||
new = calloc(sizeof(struct irq_info), 1);
|
||||
if (!new)
|
||||
return NULL;
|
||||
init_new_irq(new);
|
||||
|
||||
new->irq = irq;
|
||||
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
|
||||
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
|
||||
new->property[IRQ_NUMA].iint_val = -1;
|
||||
new->type = IRQ_TYPE_LEGACY;
|
||||
new->class = IRQ_OTHER;
|
||||
new->numa_node = get_numa_node(0);
|
||||
interrupts_db = g_list_append(interrupts_db, new);
|
||||
find.irq = irq;
|
||||
return g_list_find_custom(interrupts_db, &find, compare_ints);
|
||||
return new;
|
||||
}
|
||||
|
||||
int find_irq_integer_prop(int irq, enum irq_prop prop)
|
||||
void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data)
|
||||
{
|
||||
GList *entry;
|
||||
struct irq_info find, *result;
|
||||
GList *entry = g_list_first(list ? list : interrupts_db);
|
||||
GList *next;
|
||||
|
||||
find.irq = irq;
|
||||
|
||||
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
|
||||
|
||||
if (!entry) {
|
||||
if (debug_mode)
|
||||
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
|
||||
entry = add_misc_irq(irq);
|
||||
while (entry) {
|
||||
next = g_list_next(entry);
|
||||
cb(entry->data, data);
|
||||
entry = next;
|
||||
}
|
||||
|
||||
result = entry->data;
|
||||
assert(result->property[prop].itype == INT_TYPE);
|
||||
return result->property[prop].iint_val;
|
||||
}
|
||||
|
||||
cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop)
|
||||
struct irq_info *get_irq_info(int irq)
|
||||
{
|
||||
GList *entry;
|
||||
struct irq_info find, *result;
|
||||
|
||||
find.irq = irq;
|
||||
|
||||
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
|
||||
|
||||
if (!entry) {
|
||||
if (debug_mode)
|
||||
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
|
||||
entry = add_misc_irq(irq);
|
||||
}
|
||||
|
||||
result = entry->data;
|
||||
assert(result->property[prop].itype == CPUMASK_TYPE);
|
||||
return result->property[prop].imask_val;
|
||||
}
|
||||
|
||||
int get_next_irq(int irq)
|
||||
{
|
||||
GList *entry;
|
||||
struct irq_info *irqp, find;
|
||||
|
||||
if (irq == -1) {
|
||||
entry = g_list_first(interrupts_db);
|
||||
irqp = entry->data;
|
||||
return irqp->irq;
|
||||
}
|
||||
struct irq_info find;
|
||||
|
||||
find.irq = irq;
|
||||
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
|
||||
if (!entry)
|
||||
return entry ? entry->data : NULL;
|
||||
}
|
||||
|
||||
void migrate_irq(GList **from, GList **to, struct irq_info *info)
|
||||
{
|
||||
GList *entry;
|
||||
struct irq_info find, *tmp;;
|
||||
|
||||
find.irq = info->irq;
|
||||
entry = g_list_find_custom(*from, &find, compare_ints);
|
||||
tmp = entry->data;
|
||||
*from = g_list_delete_link(*from, entry);
|
||||
|
||||
|
||||
*to = g_list_append(*to, tmp);
|
||||
info->moved = 1;
|
||||
}
|
||||
|
||||
static gint sort_irqs(gconstpointer A, gconstpointer B)
|
||||
{
|
||||
struct irq_info *a, *b;
|
||||
a = (struct irq_info*)A;
|
||||
b = (struct irq_info*)B;
|
||||
|
||||
if (a->class < b->class)
|
||||
return 1;
|
||||
if (a->class > b->class)
|
||||
return -1;
|
||||
if (a->load < b->load)
|
||||
return 1;
|
||||
if (a->load > b->load)
|
||||
return -1;
|
||||
if (a<b)
|
||||
return 1;
|
||||
return -1;
|
||||
|
||||
entry = g_list_next(entry);
|
||||
if (!entry)
|
||||
return -1;
|
||||
irqp= entry->data;
|
||||
return irqp->irq;
|
||||
}
|
||||
|
||||
void sort_irq_list(GList **list)
|
||||
{
|
||||
*list = g_list_sort(*list, sort_irqs);
|
||||
}
|
||||
|
|
|
@ -51,10 +51,11 @@ AC_PROG_AWK
|
|||
echo .
|
||||
echo Checking for header files
|
||||
AC_HEADER_STDC
|
||||
AC_CHECK_HEADERS(linux/ethtool.h linux/sockios.h, [], [])
|
||||
|
||||
AC_CHECK_FUNCS(getopt_long)
|
||||
|
||||
AC_CHECK_LIB(numa, numa_run_on_node, [], [])
|
||||
AC_CHECK_LIB(m, floor, [], [])
|
||||
|
||||
AC_C_CONST
|
||||
AC_C_INLINE
|
||||
|
|
358
cputree.c
358
cputree.c
|
@ -55,122 +55,103 @@ cpumask_t cpu_possible_map;
|
|||
*/
|
||||
static cpumask_t unbanned_cpus;
|
||||
|
||||
static int search_numa_node(cpumask_t mask)
|
||||
{
|
||||
int node_num, ret;
|
||||
struct bitmask *node_mask;
|
||||
cpumask_t cpu_node_mask;
|
||||
|
||||
node_num = numa_num_configured_nodes();
|
||||
|
||||
if (node_num < 1)
|
||||
return -1;
|
||||
|
||||
node_mask = numa_allocate_cpumask();
|
||||
|
||||
node_num--; /* indexing from zero */
|
||||
|
||||
while (node_num >= 0) {
|
||||
ret = numa_node_to_cpus(node_num, node_mask);
|
||||
if (ret) {
|
||||
node_num--;
|
||||
continue;
|
||||
}
|
||||
memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long));
|
||||
if (cpus_intersects(mask, cpu_node_mask)) {
|
||||
numa_free_cpumask(node_mask);
|
||||
return node_num;
|
||||
}
|
||||
node_num--;
|
||||
}
|
||||
|
||||
numa_free_cpumask(node_mask);
|
||||
return node_num;
|
||||
}
|
||||
|
||||
static void fill_packages(void)
|
||||
static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
|
||||
cpumask_t package_mask)
|
||||
{
|
||||
GList *entry;
|
||||
struct topo_obj *package;
|
||||
struct topo_obj *lcache;
|
||||
|
||||
entry = g_list_first(packages);
|
||||
|
||||
entry = g_list_first(cache_domains);
|
||||
while (entry) {
|
||||
struct package *package;
|
||||
struct cache_domain *cache = NULL;
|
||||
GList *entry2;
|
||||
|
||||
cache = entry->data;
|
||||
entry2 = entry;
|
||||
entry = g_list_next(entry);
|
||||
if (cache->marker)
|
||||
continue;
|
||||
package = malloc(sizeof(struct package));
|
||||
if (!package)
|
||||
package = entry->data;
|
||||
if (cpus_equal(package_mask, package->mask))
|
||||
break;
|
||||
memset(package, 0, sizeof(struct package));
|
||||
package->mask = cache->package_mask;
|
||||
package->number = cache->number;
|
||||
package->node_num = search_numa_node(package->mask);
|
||||
while (entry2) {
|
||||
struct cache_domain *cache2;
|
||||
cache2 = entry2->data;
|
||||
if (cpus_equal(cache->package_mask, cache2->package_mask)) {
|
||||
cache2->marker = 1;
|
||||
package->cache_domains = g_list_append(package->cache_domains, cache2);
|
||||
if (package->number > cache2->number)
|
||||
package->number = cache2->number;
|
||||
}
|
||||
entry2 = g_list_next(entry2);
|
||||
entry = g_list_next(entry);
|
||||
}
|
||||
|
||||
if (!entry) {
|
||||
package = calloc(sizeof(struct topo_obj), 1);
|
||||
if (!package)
|
||||
return NULL;
|
||||
package->mask = package_mask;
|
||||
package->obj_type = OBJ_TYPE_PACKAGE;
|
||||
package->obj_type_list = &packages;
|
||||
packages = g_list_append(packages, package);
|
||||
package_count++;
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_cache_domain(void)
|
||||
entry = g_list_first(package->children);
|
||||
while (entry) {
|
||||
lcache = entry->data;
|
||||
if (lcache == cache)
|
||||
break;
|
||||
entry = g_list_next(entry);
|
||||
}
|
||||
|
||||
if (!entry) {
|
||||
package->children = g_list_append(package->children, cache);
|
||||
cache->parent = package;
|
||||
}
|
||||
|
||||
return package;
|
||||
}
|
||||
static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
|
||||
cpumask_t cache_mask)
|
||||
{
|
||||
GList *entry;
|
||||
struct topo_obj *cache;
|
||||
struct topo_obj *lcpu;
|
||||
|
||||
entry = g_list_first(cache_domains);
|
||||
|
||||
entry = g_list_first(cpus);
|
||||
while (entry) {
|
||||
struct cache_domain *cache = NULL;
|
||||
struct cpu_core *cpu;
|
||||
GList *entry2;
|
||||
cpu = entry->data;
|
||||
entry2 = entry;
|
||||
entry = g_list_next(entry);
|
||||
if (cpu->marker)
|
||||
continue;
|
||||
cache = malloc(sizeof(struct cache_domain));
|
||||
if (!cache)
|
||||
cache = entry->data;
|
||||
if (cpus_equal(cache_mask, cache->mask))
|
||||
break;
|
||||
memset(cache, 0, sizeof(struct cache_domain));
|
||||
cache->mask = cpu->cache_mask;
|
||||
cache->package_mask = cpu->package_mask;
|
||||
cache->number = cpu->number;
|
||||
cache->node_num = search_numa_node(cache->mask);
|
||||
entry = g_list_next(entry);
|
||||
}
|
||||
|
||||
if (!entry) {
|
||||
cache = calloc(sizeof(struct topo_obj), 1);
|
||||
if (!cache)
|
||||
return NULL;
|
||||
cache->obj_type = OBJ_TYPE_CACHE;
|
||||
cache->mask = cache_mask;
|
||||
cache->number = cache_domain_count;
|
||||
cache->obj_type_list = &cache_domains;
|
||||
cache_domains = g_list_append(cache_domains, cache);
|
||||
cache_domain_count++;
|
||||
while (entry2) {
|
||||
struct cpu_core *cpu2;
|
||||
cpu2 = entry2->data;
|
||||
if (cpus_equal(cpu->cache_mask, cpu2->cache_mask) &&
|
||||
cpus_equal(cpu->package_mask, cpu2->package_mask)) {
|
||||
cpu2->marker = 1;
|
||||
cache->cpu_cores = g_list_append(cache->cpu_cores, cpu2);
|
||||
if (cpu2->number < cache->number)
|
||||
cache->number = cpu2->number;
|
||||
}
|
||||
entry2 = g_list_next(entry2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
entry = g_list_first(cache->children);
|
||||
while (entry) {
|
||||
lcpu = entry->data;
|
||||
if (lcpu == cpu)
|
||||
break;
|
||||
entry = g_list_next(entry);
|
||||
}
|
||||
|
||||
if (!entry) {
|
||||
cache->children = g_list_append(cache->children, cpu);
|
||||
cpu->parent = (struct topo_obj *)cache;
|
||||
}
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
static void do_one_cpu(char *path)
|
||||
{
|
||||
struct cpu_core *cpu;
|
||||
struct topo_obj *cpu;
|
||||
FILE *file;
|
||||
char new_path[PATH_MAX];
|
||||
cpumask_t cache_mask, package_mask;
|
||||
struct topo_obj *cache;
|
||||
struct topo_obj *package;
|
||||
DIR *dir;
|
||||
struct dirent *entry;
|
||||
int nodeid;
|
||||
|
||||
/* skip offline cpus */
|
||||
snprintf(new_path, PATH_MAX, "%s/online", path);
|
||||
|
@ -188,10 +169,11 @@ static void do_one_cpu(char *path)
|
|||
free(line);
|
||||
}
|
||||
|
||||
cpu = malloc(sizeof(struct cpu_core));
|
||||
cpu = calloc(sizeof(struct topo_obj), 1);
|
||||
if (!cpu)
|
||||
return;
|
||||
memset(cpu, 0, sizeof(struct cpu_core));
|
||||
|
||||
cpu->obj_type = OBJ_TYPE_CPU;
|
||||
|
||||
cpu->number = strtoul(&path[27], NULL, 10);
|
||||
|
||||
|
@ -199,9 +181,6 @@ static void do_one_cpu(char *path)
|
|||
|
||||
cpu_set(cpu->number, cpu->mask);
|
||||
|
||||
/* set numa node of cpu */
|
||||
cpu->node_num = search_numa_node(cpu->mask);
|
||||
|
||||
/* if the cpu is on the banned list, just don't add it */
|
||||
if (cpus_intersects(cpu->mask, banned_cpus)) {
|
||||
free(cpu);
|
||||
|
@ -214,26 +193,26 @@ static void do_one_cpu(char *path)
|
|||
/* try to read the package mask; if it doesn't exist assume solitary */
|
||||
snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path);
|
||||
file = fopen(new_path, "r");
|
||||
cpu_set(cpu->number, cpu->package_mask);
|
||||
cpu_set(cpu->number, package_mask);
|
||||
if (file) {
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
if (getline(&line, &size, file))
|
||||
cpumask_parse_user(line, strlen(line), cpu->package_mask);
|
||||
cpumask_parse_user(line, strlen(line), package_mask);
|
||||
fclose(file);
|
||||
free(line);
|
||||
}
|
||||
|
||||
/* try to read the cache mask; if it doesn't exist assume solitary */
|
||||
/* We want the deepest cache level available so try index1 first, then index2 */
|
||||
cpu_set(cpu->number, cpu->cache_mask);
|
||||
cpu_set(cpu->number, cache_mask);
|
||||
snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path);
|
||||
file = fopen(new_path, "r");
|
||||
if (file) {
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
if (getline(&line, &size, file))
|
||||
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
|
||||
cpumask_parse_user(line, strlen(line), cache_mask);
|
||||
fclose(file);
|
||||
free(line);
|
||||
}
|
||||
|
@ -243,66 +222,94 @@ static void do_one_cpu(char *path)
|
|||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
if (getline(&line, &size, file))
|
||||
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
|
||||
cpumask_parse_user(line, strlen(line), cache_mask);
|
||||
fclose(file);
|
||||
free(line);
|
||||
}
|
||||
|
||||
nodeid=0;
|
||||
dir = opendir(path);
|
||||
do {
|
||||
entry = readdir(dir);
|
||||
if (!entry)
|
||||
break;
|
||||
if (strstr(entry->d_name, "node")) {
|
||||
nodeid = strtoul(&entry->d_name[4], NULL, 10);
|
||||
break;
|
||||
}
|
||||
} while (entry);
|
||||
closedir(dir);
|
||||
|
||||
cache = add_cpu_to_cache_domain(cpu, cache_mask);
|
||||
package = add_cache_domain_to_package(cache, package_mask);
|
||||
add_package_to_node(package, nodeid);
|
||||
|
||||
/*
|
||||
blank out the banned cpus from the various masks so that interrupts
|
||||
will never be told to go there
|
||||
*/
|
||||
cpus_and(cpu->cache_mask, cpu->cache_mask, unbanned_cpus);
|
||||
cpus_and(cpu->package_mask, cpu->package_mask, unbanned_cpus);
|
||||
cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus);
|
||||
cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus);
|
||||
cpus_and(cpu->mask, cpu->mask, unbanned_cpus);
|
||||
|
||||
cpu->obj_type_list = &cpus;
|
||||
cpus = g_list_append(cpus, cpu);
|
||||
core_count++;
|
||||
}
|
||||
|
||||
static void dump_irqs(int spaces, GList *dump_interrupts)
|
||||
static void dump_irq(struct irq_info *info, void *data)
|
||||
{
|
||||
struct interrupt *irq;
|
||||
while (dump_interrupts) {
|
||||
int spaces = (long int)data;
|
||||
int i;
|
||||
for (i=0; i<spaces; i++) printf(" ");
|
||||
irq = dump_interrupts->data;
|
||||
printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload);
|
||||
dump_interrupts = g_list_next(dump_interrupts);
|
||||
}
|
||||
printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load);
|
||||
}
|
||||
|
||||
static void dump_topo_obj(struct topo_obj *d, void *data __attribute__((unused)))
|
||||
{
|
||||
struct topo_obj *c = (struct topo_obj *)d;
|
||||
printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load);
|
||||
if (c->interrupts)
|
||||
for_each_irq(c->interrupts, dump_irq, (void *)18);
|
||||
}
|
||||
|
||||
static void dump_cache_domain(struct topo_obj *d, void *data)
|
||||
{
|
||||
char *buffer = data;
|
||||
cpumask_scnprintf(buffer, 4095, d->mask);
|
||||
printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load);
|
||||
if (d->children)
|
||||
for_each_object(d->children, dump_topo_obj, NULL);
|
||||
if (d->interrupts)
|
||||
for_each_irq(d->interrupts, dump_irq, (void *)10);
|
||||
}
|
||||
|
||||
static void dump_package(struct topo_obj *d, void *data)
|
||||
{
|
||||
char *buffer = data;
|
||||
cpumask_scnprintf(buffer, 4096, d->mask);
|
||||
printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load);
|
||||
if (d->children)
|
||||
for_each_object(d->children, dump_cache_domain, buffer);
|
||||
if (d->interrupts)
|
||||
for_each_irq(d->interrupts, dump_irq, (void *)2);
|
||||
}
|
||||
|
||||
void dump_tree(void)
|
||||
{
|
||||
GList *p_iter, *c_iter, *cp_iter;
|
||||
struct package *package;
|
||||
struct cache_domain *cache_domain;
|
||||
struct cpu_core *cpu;
|
||||
|
||||
char buffer[4096];
|
||||
p_iter = g_list_first(packages);
|
||||
while (p_iter) {
|
||||
package = p_iter->data;
|
||||
cpumask_scnprintf(buffer, 4096, package->mask);
|
||||
printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload);
|
||||
c_iter = g_list_first(package->cache_domains);
|
||||
while (c_iter) {
|
||||
cache_domain = c_iter->data;
|
||||
c_iter = g_list_next(c_iter);
|
||||
cpumask_scnprintf(buffer, 4095, cache_domain->mask);
|
||||
printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload);
|
||||
cp_iter = cache_domain->cpu_cores;
|
||||
while (cp_iter) {
|
||||
cpu = cp_iter->data;
|
||||
cp_iter = g_list_next(cp_iter);
|
||||
printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload);
|
||||
dump_irqs(18, cpu->interrupts);
|
||||
}
|
||||
dump_irqs(10, cache_domain->interrupts);
|
||||
}
|
||||
dump_irqs(2, package->interrupts);
|
||||
p_iter = g_list_next(p_iter);
|
||||
}
|
||||
for_each_object(packages, dump_package, buffer);
|
||||
}
|
||||
|
||||
static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
info->load = 0;
|
||||
}
|
||||
|
||||
static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused)))
|
||||
{
|
||||
for_each_object(d->children, clear_obj_stats, NULL);
|
||||
for_each_irq(d->interrupts, clear_irq_stats, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -310,40 +317,9 @@ void dump_tree(void)
|
|||
* which level does how much work and the actual lists of interrupts
|
||||
* assigned to each component
|
||||
*/
|
||||
void clear_work_stats(void)
|
||||
void clear_work_stats()
|
||||
{
|
||||
GList *p_iter, *c_iter, *cp_iter;
|
||||
struct package *package;
|
||||
struct cache_domain *cache_domain;
|
||||
struct cpu_core *cpu;
|
||||
|
||||
p_iter = g_list_first(packages);
|
||||
while (p_iter) {
|
||||
package = p_iter->data;
|
||||
package->workload = 0;
|
||||
g_list_free(package->interrupts);
|
||||
package->interrupts = NULL;
|
||||
c_iter = g_list_first(package->cache_domains);
|
||||
memset(package->class_count, 0, sizeof(package->class_count));
|
||||
while (c_iter) {
|
||||
cache_domain = c_iter->data;
|
||||
c_iter = g_list_next(c_iter);
|
||||
cache_domain->workload = 0;
|
||||
cp_iter = cache_domain->cpu_cores;
|
||||
g_list_free(cache_domain->interrupts);
|
||||
cache_domain->interrupts = NULL;
|
||||
memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count));
|
||||
while (cp_iter) {
|
||||
cpu = cp_iter->data;
|
||||
cp_iter = g_list_next(cp_iter);
|
||||
cpu->workload = 0;
|
||||
g_list_free(cpu->interrupts);
|
||||
cpu->interrupts = NULL;
|
||||
memset(cpu->class_count, 0, sizeof(cpu->class_count));
|
||||
}
|
||||
}
|
||||
p_iter = g_list_next(p_iter);
|
||||
}
|
||||
for_each_object(numa_nodes, clear_obj_stats, NULL);
|
||||
}
|
||||
|
||||
|
||||
|
@ -373,9 +349,6 @@ void parse_cpu_tree(void)
|
|||
} while (entry);
|
||||
closedir(dir);
|
||||
|
||||
fill_cache_domain();
|
||||
fill_packages();
|
||||
|
||||
if (debug_mode)
|
||||
dump_tree();
|
||||
|
||||
|
@ -389,14 +362,14 @@ void parse_cpu_tree(void)
|
|||
void clear_cpu_tree(void)
|
||||
{
|
||||
GList *item;
|
||||
struct cpu_core *cpu;
|
||||
struct cache_domain *cache_domain;
|
||||
struct package *package;
|
||||
struct topo_obj *cpu;
|
||||
struct topo_obj *cache_domain;
|
||||
struct topo_obj *package;
|
||||
|
||||
while (packages) {
|
||||
item = g_list_first(packages);
|
||||
package = item->data;
|
||||
g_list_free(package->cache_domains);
|
||||
g_list_free(package->children);
|
||||
g_list_free(package->interrupts);
|
||||
free(package);
|
||||
packages = g_list_delete_link(packages, item);
|
||||
|
@ -406,7 +379,7 @@ void clear_cpu_tree(void)
|
|||
while (cache_domains) {
|
||||
item = g_list_first(cache_domains);
|
||||
cache_domain = item->data;
|
||||
g_list_free(cache_domain->cpu_cores);
|
||||
g_list_free(cache_domain->children);
|
||||
g_list_free(cache_domain->interrupts);
|
||||
free(cache_domain);
|
||||
cache_domains = g_list_delete_link(cache_domains, item);
|
||||
|
@ -424,3 +397,28 @@ void clear_cpu_tree(void)
|
|||
core_count = 0;
|
||||
|
||||
}
|
||||
|
||||
static gint compare_cpus(gconstpointer a, gconstpointer b)
|
||||
{
|
||||
const struct topo_obj *ai = a;
|
||||
const struct topo_obj *bi = b;
|
||||
|
||||
return ai->number - bi->number;
|
||||
}
|
||||
|
||||
struct topo_obj *find_cpu_core(int cpunr)
|
||||
{
|
||||
GList *entry;
|
||||
struct topo_obj find;
|
||||
|
||||
find.number = cpunr;
|
||||
entry = g_list_find_custom(cpus, &find, compare_cpus);
|
||||
|
||||
return entry ? entry->data : NULL;
|
||||
}
|
||||
|
||||
int get_cpu_count(void)
|
||||
{
|
||||
return g_list_length(cpus);
|
||||
}
|
||||
|
||||
|
|
13
irqbalance.1
13
irqbalance.1
|
@ -41,6 +41,19 @@ Causes irqbalance to be run once, after which the daemon exits
|
|||
.B --debug
|
||||
Causes irqbalance to run in the foreground and extra debug information to be printed
|
||||
|
||||
.TP
|
||||
.B --hintpolicy=[exact | subset | ignore]
|
||||
Set the policy for how irq kernel affinity hinting is treated. Can be one of:
|
||||
.P
|
||||
.I exact
|
||||
irq affinity hint is applied unilaterally and never violated
|
||||
.P
|
||||
.I subset
|
||||
irq is balanced, but the assigned object will be a subset of the affinity hint
|
||||
.P
|
||||
.I ignore
|
||||
irq affinity hint value is completely ignored
|
||||
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
.TP
|
||||
.B IRQBALANCE_ONESHOT
|
||||
|
|
128
irqbalance.c
128
irqbalance.c
|
@ -38,13 +38,11 @@
|
|||
int one_shot_mode;
|
||||
int debug_mode;
|
||||
int numa_avail;
|
||||
|
||||
int need_cpu_rescan;
|
||||
|
||||
extern cpumask_t banned_cpus;
|
||||
|
||||
static int counter;
|
||||
|
||||
enum hp_e hint_policy = HINT_POLICY_SUBSET;
|
||||
unsigned long power_thresh = ULONG_MAX;
|
||||
unsigned long long cycle_count = 0;
|
||||
|
||||
void sleep_approx(int seconds)
|
||||
{
|
||||
|
@ -64,12 +62,15 @@ void sleep_approx(int seconds)
|
|||
struct option lopts[] = {
|
||||
{"oneshot", 0, NULL, 'o'},
|
||||
{"debug", 0, NULL, 'd'},
|
||||
{"hintpolicy", 1, NULL, 'h'},
|
||||
{"powerthresh", 1, NULL, 'p'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
static void usage(void)
|
||||
{
|
||||
printf("irqbalance [--oneshot | -o] [--debug | -d]");
|
||||
printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]\n");
|
||||
printf(" [--powerthresh= | -p <off> | <n>]\n");
|
||||
}
|
||||
|
||||
static void parse_command_line(int argc, char **argv)
|
||||
|
@ -78,7 +79,7 @@ static void parse_command_line(int argc, char **argv)
|
|||
int longind;
|
||||
|
||||
while ((opt = getopt_long(argc, argv,
|
||||
"",
|
||||
"odh:p:",
|
||||
lopts, &longind)) != -1) {
|
||||
|
||||
switch(opt) {
|
||||
|
@ -88,6 +89,29 @@ static void parse_command_line(int argc, char **argv)
|
|||
case 'd':
|
||||
debug_mode=1;
|
||||
break;
|
||||
case 'h':
|
||||
if (!strncmp(optarg, "exact", strlen(optarg)))
|
||||
hint_policy = HINT_POLICY_EXACT;
|
||||
else if (!strncmp(optarg, "subset", strlen(optarg)))
|
||||
hint_policy = HINT_POLICY_SUBSET;
|
||||
else if (!strncmp(optarg, "ignore", strlen(optarg)))
|
||||
hint_policy = HINT_POLICY_IGNORE;
|
||||
else {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
case 'p':
|
||||
if (!strncmp(optarg, "off", strlen(optarg)))
|
||||
power_thresh = ULONG_MAX;
|
||||
else {
|
||||
power_thresh = strtoull(optarg, NULL, 10);
|
||||
if (power_thresh == ULONG_MAX) {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'o':
|
||||
one_shot_mode=1;
|
||||
break;
|
||||
|
@ -96,6 +120,50 @@ static void parse_command_line(int argc, char **argv)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This builds our object tree. The Hierarchy is pretty straightforward
|
||||
* At the top are numa_nodes
|
||||
* All CPU packages belong to a single numa_node
|
||||
* All Cache domains belong to a CPU package
|
||||
* All CPU cores belong to a cache domain
|
||||
*
|
||||
* Objects are built in that order (top down)
|
||||
*
|
||||
* Object workload is the aggregate sum of the
|
||||
* workload of the objects below it
|
||||
*/
|
||||
static void build_object_tree()
|
||||
{
|
||||
build_numa_node_list();
|
||||
parse_cpu_tree();
|
||||
rebuild_irq_db();
|
||||
}
|
||||
|
||||
static void free_object_tree()
|
||||
{
|
||||
free_numa_node_list();
|
||||
clear_cpu_tree();
|
||||
free_irq_db();
|
||||
}
|
||||
|
||||
static void dump_object_tree()
|
||||
{
|
||||
for_each_object(numa_nodes, dump_numa_node_info, NULL);
|
||||
}
|
||||
|
||||
static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
if (info->level == BALANCE_NONE)
|
||||
return;
|
||||
|
||||
if (info->assigned_obj == NULL)
|
||||
rebalance_irq_list = g_list_append(rebalance_irq_list, info);
|
||||
else
|
||||
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
|
||||
|
||||
info->assigned_obj = NULL;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
|
||||
|
@ -125,9 +193,9 @@ int main(int argc, char** argv)
|
|||
}
|
||||
|
||||
|
||||
rebuild_irq_db();
|
||||
|
||||
parse_cpu_tree();
|
||||
build_object_tree();
|
||||
if (debug_mode)
|
||||
dump_object_tree();
|
||||
|
||||
|
||||
/* On single core UP systems irqbalance obviously has no work to do */
|
||||
|
@ -150,15 +218,10 @@ int main(int argc, char** argv)
|
|||
capng_apply(CAPNG_SELECT_BOTH);
|
||||
#endif
|
||||
|
||||
for_each_irq(NULL, force_rebalance_irq, NULL);
|
||||
|
||||
parse_proc_interrupts();
|
||||
sleep(SLEEP_INTERVAL/4);
|
||||
reset_counts();
|
||||
parse_proc_interrupts();
|
||||
pci_numa_scan();
|
||||
calculate_workload();
|
||||
sort_irq_list();
|
||||
if (debug_mode)
|
||||
dump_workloads();
|
||||
parse_proc_stat();
|
||||
|
||||
while (1) {
|
||||
sleep_approx(SLEEP_INTERVAL);
|
||||
|
@ -166,8 +229,9 @@ int main(int argc, char** argv)
|
|||
printf("\n\n\n-----------------------------------------------------------------------------\n");
|
||||
|
||||
|
||||
check_power_mode();
|
||||
clear_work_stats();
|
||||
parse_proc_interrupts();
|
||||
parse_proc_stat();
|
||||
|
||||
/* cope with cpu hotplug -- detected during /proc/interrupts parsing */
|
||||
if (need_cpu_rescan) {
|
||||
|
@ -179,25 +243,31 @@ int main(int argc, char** argv)
|
|||
reset_counts();
|
||||
clear_work_stats();
|
||||
|
||||
clear_cpu_tree();
|
||||
parse_cpu_tree();
|
||||
free_object_tree();
|
||||
build_object_tree();
|
||||
for_each_irq(NULL, force_rebalance_irq, NULL);
|
||||
parse_proc_interrupts();
|
||||
parse_proc_stat();
|
||||
sleep_approx(SLEEP_INTERVAL);
|
||||
clear_work_stats();
|
||||
parse_proc_interrupts();
|
||||
parse_proc_stat();
|
||||
cycle_count=0;
|
||||
}
|
||||
|
||||
calculate_workload();
|
||||
|
||||
/* to cope with dynamic configurations we scan for new numa information
|
||||
* once every 5 minutes
|
||||
*/
|
||||
pci_numa_scan();
|
||||
if (cycle_count)
|
||||
update_migration_status();
|
||||
|
||||
calculate_placement();
|
||||
activate_mapping();
|
||||
activate_mappings();
|
||||
|
||||
if (debug_mode)
|
||||
dump_tree();
|
||||
if (one_shot_mode)
|
||||
break;
|
||||
counter++;
|
||||
cycle_count++;
|
||||
|
||||
}
|
||||
free_object_tree();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
|
103
irqbalance.h
103
irqbalance.h
|
@ -8,47 +8,116 @@
|
|||
|
||||
#include <stdint.h>
|
||||
#include <glib.h>
|
||||
#include <syslog.h>
|
||||
|
||||
#include "types.h"
|
||||
#include <numa.h>
|
||||
|
||||
struct interrupt;
|
||||
|
||||
extern int package_count;
|
||||
extern int cache_domain_count;
|
||||
extern int core_count;
|
||||
extern char *classes[];
|
||||
extern int map_class_to_level[7];
|
||||
extern int class_counts[7];
|
||||
extern int debug_mode;
|
||||
extern int power_mode;
|
||||
extern int need_cpu_rescan;
|
||||
extern int one_shot_mode;
|
||||
extern GList *interrupts;
|
||||
|
||||
extern void parse_cpu_tree(void);
|
||||
extern void clear_work_stats(void);
|
||||
extern void parse_proc_interrupts(void);
|
||||
extern void rebuild_irq_db(void);
|
||||
extern void parse_proc_stat(void);
|
||||
extern void set_interrupt_count(int number, uint64_t count);
|
||||
extern void set_msi_interrupt_numa(int number);
|
||||
extern int get_next_irq(int irq);
|
||||
extern int find_irq_integer_prop(int irq, enum irq_prop prop);
|
||||
extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop);
|
||||
|
||||
extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type);
|
||||
extern GList *rebalance_irq_list;
|
||||
|
||||
void calculate_workload(void);
|
||||
void update_migration_status(void);
|
||||
void reset_counts(void);
|
||||
void dump_workloads(void);
|
||||
void sort_irq_list(void);
|
||||
void sort_irq_list(GList **list);
|
||||
void calculate_placement(void);
|
||||
void dump_tree(void);
|
||||
|
||||
void activate_mapping(void);
|
||||
void activate_mappings(void);
|
||||
void account_for_nic_stats(void);
|
||||
void check_power_mode(void);
|
||||
void clear_cpu_tree(void);
|
||||
void pci_numa_scan(void);
|
||||
|
||||
/*===================NEW BALANCER FUNCTIONS============================*/
|
||||
|
||||
/*
|
||||
* Master topo_obj type lists
|
||||
*/
|
||||
extern GList *numa_nodes;
|
||||
extern GList *packages;
|
||||
extern GList *cache_domains;
|
||||
extern GList *cpus;
|
||||
|
||||
enum hp_e {
|
||||
HINT_POLICY_IGNORE,
|
||||
HINT_POLICY_SUBSET,
|
||||
HINT_POLICY_EXACT
|
||||
};
|
||||
|
||||
extern int debug_mode;
|
||||
extern int one_shot_mode;
|
||||
extern int power_mode;
|
||||
extern int need_cpu_rescan;
|
||||
extern enum hp_e hint_policy;
|
||||
extern unsigned long long cycle_count;
|
||||
extern unsigned long power_thresh;
|
||||
|
||||
/*
|
||||
* Numa node access routines
|
||||
*/
|
||||
extern void build_numa_node_list(void);
|
||||
extern void free_numa_node_list(void);
|
||||
extern void dump_numa_node_info(struct topo_obj *node, void *data);
|
||||
extern void add_package_to_node(struct topo_obj *p, int nodeid);
|
||||
extern struct topo_obj *get_numa_node(int nodeid);
|
||||
|
||||
/*
|
||||
* Package functions
|
||||
*/
|
||||
#define package_numa_node(p) ((p)->parent)
|
||||
|
||||
/*
|
||||
* cache_domain functions
|
||||
*/
|
||||
#define cache_domain_package(c) ((c)->parent)
|
||||
#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c))))
|
||||
|
||||
/*
|
||||
* cpu core functions
|
||||
*/
|
||||
#define cpu_cache_domain(cpu) ((cpu)->parent)
|
||||
#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu))))
|
||||
#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu)))))
|
||||
extern struct topo_obj *find_cpu_core(int cpunr);
|
||||
extern int get_cpu_count(void);
|
||||
|
||||
/*
|
||||
* irq db functions
|
||||
*/
|
||||
extern void rebuild_irq_db(void);
|
||||
extern void free_irq_db(void);
|
||||
extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data);
|
||||
extern struct irq_info *get_irq_info(int irq);
|
||||
extern void migrate_irq(GList **from, GList **to, struct irq_info *info);
|
||||
extern struct irq_info *add_misc_irq(int irq);
|
||||
#define irq_numa_node(irq) ((irq)->numa_node)
|
||||
|
||||
|
||||
/*
|
||||
* Generic object functions
|
||||
*/
|
||||
/*
 * Invoke cb(obj, data) once for every element of list, starting from the
 * first node of the list that contains it.
 *
 * The successor of the current node is fetched before the callback runs,
 * so the walk does not read the current node again after cb returns.
 */
static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data)
{
	GList *cur, *following;

	for (cur = g_list_first(list); cur; cur = following) {
		following = g_list_next(cur);
		cb(cur->data, data);
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
380
irqlist.c
380
irqlist.c
|
@ -29,285 +29,183 @@
|
|||
#include <sys/types.h>
|
||||
#include <dirent.h>
|
||||
#include <errno.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "irqbalance.h"
|
||||
|
||||
GList *interrupts;
|
||||
|
||||
|
||||
struct load_balance_info {
|
||||
unsigned long long int total_load;
|
||||
unsigned long long avg_load;
|
||||
int load_sources;
|
||||
unsigned long long int deviations;
|
||||
long double std_deviation;
|
||||
unsigned int num_within;
|
||||
unsigned int num_over;
|
||||
unsigned int num_under;
|
||||
struct topo_obj *powersave;
|
||||
};
|
||||
|
||||
void get_affinity_hint(struct interrupt *irq, int number)
|
||||
static void gather_load_stats(struct topo_obj *obj, void *data)
|
||||
{
|
||||
char buf[PATH_MAX];
|
||||
cpumask_t tempmask;
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
FILE *file;
|
||||
sprintf(buf, "/proc/irq/%i/affinity_hint", number);
|
||||
file = fopen(buf, "r");
|
||||
if (!file)
|
||||
return;
|
||||
if (getline(&line, &size, file)==0) {
|
||||
free(line);
|
||||
fclose(file);
|
||||
return;
|
||||
}
|
||||
cpumask_parse_user(line, strlen(line), tempmask);
|
||||
if (!__cpus_full(&tempmask, num_possible_cpus()))
|
||||
irq->node_mask = tempmask;
|
||||
fclose(file);
|
||||
free(line);
|
||||
struct load_balance_info *info = data;
|
||||
|
||||
info->total_load += obj->load;
|
||||
info->load_sources += 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function classifies and reads various things from /proc about a specific irq
|
||||
*/
|
||||
static void investigate(struct interrupt *irq, int number)
|
||||
static void compute_deviations(struct topo_obj *obj, void *data)
|
||||
{
|
||||
DIR *dir;
|
||||
struct dirent *entry;
|
||||
char *c, *c2;
|
||||
int nr , count = 0, can_set = 1;
|
||||
char buf[PATH_MAX];
|
||||
sprintf(buf, "/proc/irq/%i", number);
|
||||
dir = opendir(buf);
|
||||
do {
|
||||
entry = readdir(dir);
|
||||
if (!entry)
|
||||
break;
|
||||
if (strcmp(entry->d_name,"smp_affinity")==0) {
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
FILE *file;
|
||||
sprintf(buf, "/proc/irq/%i/smp_affinity", number);
|
||||
file = fopen(buf, "r+");
|
||||
if (!file)
|
||||
continue;
|
||||
if (getline(&line, &size, file)==0) {
|
||||
free(line);
|
||||
fclose(file);
|
||||
continue;
|
||||
}
|
||||
cpumask_parse_user(line, strlen(line), irq->mask);
|
||||
struct load_balance_info *info = data;
|
||||
unsigned long long int deviation;
|
||||
|
||||
deviation = (obj->load > info->avg_load) ?
|
||||
obj->load - info->avg_load :
|
||||
info->avg_load - obj->load;
|
||||
|
||||
info->deviations += (deviation * deviation);
|
||||
}
|
||||
|
||||
static void move_candidate_irqs(struct irq_info *info, void *data)
|
||||
{
|
||||
int *remaining_deviation = (int *)data;
|
||||
|
||||
/* never move an irq that has an afinity hint when
|
||||
* hint_policy is HINT_POLICY_EXACT
|
||||
*/
|
||||
if (hint_policy == HINT_POLICY_EXACT)
|
||||
if (!cpus_empty(info->affinity_hint))
|
||||
return;
|
||||
|
||||
/* Don't rebalance irqs that don't want it */
|
||||
if (info->level == BALANCE_NONE)
|
||||
return;
|
||||
|
||||
/* Don't move cpus that only have one irq, regardless of load */
|
||||
if (g_list_length(info->assigned_obj->interrupts) <= 1)
|
||||
return;
|
||||
|
||||
/* Stop rebalancing if we've estimated a full reduction of deviation */
|
||||
if (*remaining_deviation <= 0)
|
||||
return;
|
||||
|
||||
*remaining_deviation -= info->load;
|
||||
|
||||
if (debug_mode)
|
||||
printf("Selecting irq %d for rebalancing\n", info->irq);
|
||||
|
||||
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
|
||||
|
||||
info->assigned_obj = NULL;
|
||||
}
|
||||
|
||||
static void migrate_overloaded_irqs(struct topo_obj *obj, void *data)
|
||||
{
|
||||
struct load_balance_info *info = data;
|
||||
int deviation;
|
||||
|
||||
/*
|
||||
* Check that we can write the affinity, if
|
||||
* not take it out of the list.
|
||||
* Don't rebalance irqs on objects whose load is below the average
|
||||
*/
|
||||
fputs(line, file);
|
||||
if (fclose(file) && errno == EIO)
|
||||
can_set = 0;
|
||||
free(line);
|
||||
} else if (strcmp(entry->d_name,"allowed_affinity")==0) {
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
FILE *file;
|
||||
sprintf(buf, "/proc/irq/%i/allowed_affinity", number);
|
||||
file = fopen(buf, "r");
|
||||
if (!file)
|
||||
continue;
|
||||
if (getline(&line, &size, file)==0) {
|
||||
free(line);
|
||||
fclose(file);
|
||||
continue;
|
||||
}
|
||||
cpumask_parse_user(line, strlen(line), irq->allowed_mask);
|
||||
fclose(file);
|
||||
free(line);
|
||||
} else if (strcmp(entry->d_name,"affinity_hint")==0) {
|
||||
get_affinity_hint(irq, number);
|
||||
} else {
|
||||
irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS);
|
||||
}
|
||||
|
||||
} while (entry);
|
||||
closedir(dir);
|
||||
irq->balance_level = map_class_to_level[irq->class];
|
||||
|
||||
for (nr = 0; nr < NR_CPUS; nr++)
|
||||
if (cpu_isset(nr, irq->allowed_mask))
|
||||
count++;
|
||||
|
||||
/* if there is no choice in the allowed mask, don't bother to balance */
|
||||
if ((count<2) || (can_set == 0))
|
||||
irq->balance_level = BALANCE_NONE;
|
||||
|
||||
|
||||
/* next, check the IRQBALANCE_BANNED_INTERRUPTS env variable for blacklisted irqs */
|
||||
c = c2 = getenv("IRQBALANCE_BANNED_INTERRUPTS");
|
||||
if (!c)
|
||||
if (obj->load <= info->avg_load) {
|
||||
if ((obj->load + info->std_deviation) <= info->avg_load) {
|
||||
info->num_under++;
|
||||
info->powersave = obj;
|
||||
} else
|
||||
info->num_within++;
|
||||
return;
|
||||
}
|
||||
|
||||
deviation = obj->load - info->avg_load;
|
||||
|
||||
if ((deviation > info->std_deviation) &&
|
||||
(g_list_length(obj->interrupts) > 1)) {
|
||||
|
||||
info->num_over++;
|
||||
/*
|
||||
* We have a cpu that is overloaded and
|
||||
* has irqs that can be moved to fix that
|
||||
*/
|
||||
|
||||
/* order the list from least to greatest workload */
|
||||
sort_irq_list(&obj->interrupts);
|
||||
/*
|
||||
* Each irq carries a weighted average amount of load
|
||||
* we think it's responsible for. Set deviation to be the load
|
||||
* of the difference between this object's load and the average,
|
||||
* and migrate irqs until we only have one left, or until that
|
||||
* difference reaches zero
|
||||
*/
|
||||
for_each_irq(obj->interrupts, move_candidate_irqs, &deviation);
|
||||
} else
|
||||
info->num_within++;
|
||||
|
||||
do {
|
||||
c = c2;
|
||||
nr = strtoul(c, &c2, 10);
|
||||
if (c!=c2 && nr == number)
|
||||
irq->balance_level = BALANCE_NONE;
|
||||
} while (c!=c2 && c2!=NULL);
|
||||
}
|
||||
|
||||
/* Set numa node number for MSI interrupt;
|
||||
* Assumes existing irq metadata
|
||||
*/
|
||||
void set_msi_interrupt_numa(int number)
|
||||
static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
int node;
|
||||
|
||||
node = find_irq_integer_prop(number, IRQ_NUMA);
|
||||
if (node < 0)
|
||||
return;
|
||||
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
|
||||
if (irq->number == number) {
|
||||
irq->node_num = node;
|
||||
irq->msi = 1;
|
||||
return;
|
||||
}
|
||||
item = g_list_next(item);
|
||||
}
|
||||
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the number of interrupts received for a specific irq;
|
||||
* create the irq metadata if there is none yet
|
||||
*/
|
||||
void set_interrupt_count(int number, uint64_t count)
|
||||
static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused)))
|
||||
{
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
|
||||
if (count < MIN_IRQ_COUNT && !one_shot_mode)
|
||||
return; /* no need to track or set interrupts sources without any activity since boot
|
||||
but allow for a few (20) boot-time-only interrupts */
|
||||
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
|
||||
if (irq->number == number) {
|
||||
irq->count = count;
|
||||
/* see if affinity_hint changed */
|
||||
get_affinity_hint(irq, number);
|
||||
return;
|
||||
}
|
||||
item = g_list_next(item);
|
||||
}
|
||||
/* new interrupt */
|
||||
irq = malloc(sizeof(struct interrupt));
|
||||
if (!irq)
|
||||
return;
|
||||
memset(irq, 0, sizeof(struct interrupt));
|
||||
irq->node_num = -1;
|
||||
irq->number = number;
|
||||
irq->count = count;
|
||||
irq->allowed_mask = CPU_MASK_ALL;
|
||||
investigate(irq, number);
|
||||
interrupts = g_list_append(interrupts, irq);
|
||||
obj->powersave_mode = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the numa affinity mask for a specific interrupt if there
|
||||
* is metadata for the interrupt; do nothing if no such data
|
||||
* exists.
|
||||
*/
|
||||
void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type)
|
||||
/*
 * find_overloaded_objs(name, info)
 *
 * name: GList of topo_obj to examine
 * info: a struct load_balance_info lvalue; it is zeroed and then filled in
 *
 * Steps:
 *  1. gather_load_stats sums the load of every object and counts them
 *  2. the average load is derived from those totals
 *  3. compute_deviations accumulates the squared deviations from it
 *  4. the standard deviation is taken over (count - 1) samples, or over 1
 *     when there is only a single object
 *  5. migrate_overloaded_irqs is run on every object with the result
 *
 * When the list is empty there is nothing to average (and dividing by the
 * zero object count would trap), so the macro stops after step 1 and
 * leaves info zeroed.
 *
 * The division for the standard deviation is carried out in long double so
 * the fractional part is not thrown away before the square root is taken.
 */
#define find_overloaded_objs(name, info) do {\
	long double divisor_;\
	memset(&(info), 0, sizeof(struct load_balance_info));\
	for_each_object((name), gather_load_stats, &(info));\
	if ((info).load_sources <= 0)\
		break;\
	(info).avg_load = (info).total_load / (info).load_sources;\
	for_each_object((name), compute_deviations, &(info));\
	divisor_ = ((info).load_sources == 1) ? 1 : ((info).load_sources - 1);\
	(info).std_deviation = sqrtl((long double)(info).deviations / divisor_);\
	for_each_object((name), migrate_overloaded_irqs, &(info));\
}while(0)
|
||||
|
||||
void update_migration_status(void)
|
||||
{
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
struct load_balance_info info;
|
||||
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
item = g_list_next(item);
|
||||
|
||||
if (irq->number == number) {
|
||||
cpus_or(irq->numa_mask, irq->numa_mask, mask);
|
||||
irq->node_num = node_num;
|
||||
if (irq->class < type && irq->balance_level != BALANCE_NONE) {
|
||||
irq->class = type;
|
||||
irq->balance_level = map_class_to_level[irq->class];
|
||||
}
|
||||
return;
|
||||
find_overloaded_objs(cpus, info);
|
||||
if (cycle_count > 5) {
|
||||
if (!info.num_over && (info.num_under >= power_thresh)) {
|
||||
syslog(LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number);
|
||||
info.powersave->powersave_mode = 1;
|
||||
for_each_irq(info.powersave->interrupts, force_irq_migration, NULL);
|
||||
} else if (info.num_over) {
|
||||
syslog(LOG_INFO, "Load average increasing, re-enabling all cpus for irq balancing\n");
|
||||
for_each_object(cpus, clear_powersave_mode, NULL);
|
||||
}
|
||||
}
|
||||
find_overloaded_objs(cache_domains, info);
|
||||
find_overloaded_objs(packages, info);
|
||||
find_overloaded_objs(numa_nodes, info);
|
||||
}
|
||||
|
||||
void calculate_workload(void)
|
||||
|
||||
static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused)))
|
||||
{
|
||||
int i;
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
|
||||
for (i=0; i<7; i++)
|
||||
class_counts[i]=0;
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
item = g_list_next(item);
|
||||
|
||||
irq->workload = irq->count - irq->old_count + irq->workload/3 + irq->extra;
|
||||
class_counts[irq->class]++;
|
||||
irq->old_count = irq->count;
|
||||
irq->extra = 0;
|
||||
}
|
||||
info->last_irq_count = info->irq_count;
|
||||
info->irq_count = 0;
|
||||
}
|
||||
|
||||
void reset_counts(void)
|
||||
{
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
item = g_list_next(item);
|
||||
irq->old_count = irq->count;
|
||||
irq->extra = 0;
|
||||
for_each_irq(NULL, reset_irq_count, NULL);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void dump_workload(struct irq_info *info, void *unused __attribute__((unused)))
|
||||
{
|
||||
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load);
|
||||
}
|
||||
|
||||
void dump_workloads(void)
|
||||
{
|
||||
GList *item;
|
||||
struct interrupt *irq;
|
||||
item = g_list_first(interrupts);
|
||||
while (item) {
|
||||
irq = item->data;
|
||||
item = g_list_next(item);
|
||||
|
||||
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload);
|
||||
|
||||
}
|
||||
for_each_irq(NULL, dump_workload, NULL);
|
||||
}
|
||||
|
||||
|
||||
static gint sort_irqs(gconstpointer A, gconstpointer B)
|
||||
{
|
||||
struct interrupt *a, *b;
|
||||
a = (struct interrupt*)A;
|
||||
b = (struct interrupt*)B;
|
||||
|
||||
if (a->class < b->class)
|
||||
return 1;
|
||||
if (a->class > b->class)
|
||||
return -1;
|
||||
if (a->workload < b->workload)
|
||||
return 1;
|
||||
if (a->workload > b->workload)
|
||||
return -1;
|
||||
if (a<b)
|
||||
return 1;
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
void sort_irq_list(void)
|
||||
{
|
||||
/* sort by class first (high->low) and then by workload (high->low) */
|
||||
interrupts = g_list_sort(interrupts, sort_irqs);
|
||||
}
|
||||
|
|
142
numa.c
142
numa.c
|
@ -33,24 +33,130 @@
|
|||
|
||||
#include "irqbalance.h"
|
||||
|
||||
void pci_numa_scan(void)
|
||||
#define SYSFS_NODE_PATH "/sys/devices/system/node"
|
||||
|
||||
GList *numa_nodes = NULL;
|
||||
|
||||
struct topo_obj unspecified_node = {
|
||||
.load = 0,
|
||||
.number = -1,
|
||||
.obj_type = OBJ_TYPE_NODE,
|
||||
.mask = CPU_MASK_ALL,
|
||||
.interrupts = NULL,
|
||||
.children = NULL,
|
||||
.parent = NULL,
|
||||
.obj_type_list = &numa_nodes,
|
||||
};
|
||||
|
||||
static void add_one_node(const char *nodename)
|
||||
{
|
||||
int irq = -1;
|
||||
cpumask_t mask;
|
||||
int node_num;
|
||||
do {
|
||||
int type;
|
||||
irq = get_next_irq(irq);
|
||||
if (irq == -1)
|
||||
break;
|
||||
char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + 1);
|
||||
struct topo_obj *new;
|
||||
char *cpustr;
|
||||
FILE *f;
|
||||
|
||||
mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK);
|
||||
|
||||
node_num = find_irq_integer_prop(irq, IRQ_NUMA);
|
||||
|
||||
type = find_irq_integer_prop(irq, IRQ_CLASS);
|
||||
|
||||
add_interrupt_numa(irq, mask, node_num, type);
|
||||
|
||||
} while (irq != -1);
|
||||
if (!path)
|
||||
return;
|
||||
new = calloc(1, sizeof(struct topo_obj));
|
||||
if (!new)
|
||||
return;
|
||||
sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename);
|
||||
f = fopen(path, "r");
|
||||
if (ferror(f)) {
|
||||
cpus_clear(new->mask);
|
||||
} else {
|
||||
fscanf(f, "%as", &cpustr);
|
||||
if (!cpustr) {
|
||||
cpus_clear(new->mask);
|
||||
} else {
|
||||
cpumask_parse_user(cpustr, strlen(cpustr), new->mask);
|
||||
free(cpustr);
|
||||
}
|
||||
}
|
||||
new->obj_type = OBJ_TYPE_NODE;
|
||||
new->number = strtoul(&nodename[4], NULL, 10);
|
||||
new->obj_type_list = &numa_nodes;
|
||||
numa_nodes = g_list_append(numa_nodes, new);
|
||||
}
|
||||
|
||||
void build_numa_node_list(void)
|
||||
{
|
||||
DIR *dir = opendir(SYSFS_NODE_PATH);
|
||||
struct dirent *entry;
|
||||
|
||||
do {
|
||||
entry = readdir(dir);
|
||||
if (!entry)
|
||||
break;
|
||||
if ((entry->d_type == DT_DIR) && (strstr(entry->d_name, "node"))) {
|
||||
add_one_node(entry->d_name);
|
||||
}
|
||||
} while (entry);
|
||||
}
|
||||
|
||||
static void free_numa_node(gpointer data)
|
||||
{
|
||||
free(data);
|
||||
}
|
||||
|
||||
void free_numa_node_list(void)
|
||||
{
|
||||
g_list_free_full(numa_nodes, free_numa_node);
|
||||
numa_nodes = NULL;
|
||||
}
|
||||
|
||||
/*
 * g_list_find_custom() comparator for topo_obj entries.
 *
 * Returns 0 when both objects carry the same number and 1 otherwise; it
 * only tests for equality and does not define an ordering.
 */
static gint compare_node(gconstpointer a, gconstpointer b)
{
	const struct topo_obj *lhs = a;
	const struct topo_obj *rhs = b;

	if (lhs->number != rhs->number)
		return 1;

	return 0;
}
|
||||
|
||||
void add_package_to_node(struct topo_obj *p, int nodeid)
|
||||
{
|
||||
struct topo_obj find, *node;
|
||||
find.number = nodeid;
|
||||
GList *entry;
|
||||
|
||||
find.number = nodeid;
|
||||
entry = g_list_find_custom(numa_nodes, &find, compare_node);
|
||||
|
||||
if (!entry) {
|
||||
if (debug_mode)
|
||||
printf("Could not find numa node for node id %d\n", nodeid);
|
||||
return;
|
||||
}
|
||||
|
||||
node = entry->data;
|
||||
|
||||
if (!p->parent) {
|
||||
node->children = g_list_append(node->children, p);
|
||||
p->parent = node;
|
||||
}
|
||||
}
|
||||
|
||||
void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused)))
|
||||
{
|
||||
char buffer[4096];
|
||||
|
||||
printf("NUMA NODE NUMBER: %d\n", d->number);
|
||||
cpumask_scnprintf(buffer, 4096, d->mask);
|
||||
printf("LOCAL CPU MASK: %s\n", buffer);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
struct topo_obj *get_numa_node(int nodeid)
|
||||
{
|
||||
struct topo_obj find;
|
||||
GList *entry;
|
||||
|
||||
if (nodeid == -1)
|
||||
return &unspecified_node;
|
||||
|
||||
find.number = nodeid;
|
||||
|
||||
entry = g_list_find_custom(numa_nodes, &find, compare_node);
|
||||
return entry ? entry->data : NULL;
|
||||
}
|
||||
|
||||
|
|
430
placement.c
430
placement.c
|
@ -30,355 +30,167 @@
|
|||
|
||||
int power_mode;
|
||||
|
||||
extern GList *interrupts, *packages, *cache_domains, *cpus;
|
||||
GList *rebalance_irq_list;
|
||||
|
||||
static uint64_t package_cost_func(struct interrupt *irq, struct package *package)
|
||||
struct obj_placement {
|
||||
struct topo_obj *best;
|
||||
struct topo_obj *least_irqs;
|
||||
uint64_t best_cost;
|
||||
struct irq_info *info;
|
||||
};
|
||||
|
||||
static void find_best_object(struct topo_obj *d, void *data)
|
||||
{
|
||||
int bonus = 0;
|
||||
int maxcount;
|
||||
int dist;
|
||||
/* moving to a cold package/cache/etc gets you a 3000 penalty */
|
||||
if (!cpus_intersects(irq->old_mask, package->mask))
|
||||
bonus = CROSS_PACKAGE_PENALTY;
|
||||
|
||||
/* do a little numa affinity */
|
||||
if (irq->node_num != package->node_num) {
|
||||
if (irq->node_num >= 0 && package->node_num >= 0) {
|
||||
dist = numa_distance(irq->node_num, package->node_num);
|
||||
/* moving to a distant numa node results into penalty */
|
||||
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* but if the irq has had 0 interrupts for a while move it about more easily */
|
||||
if (irq->workload==0)
|
||||
bonus = bonus / 10;
|
||||
|
||||
/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
|
||||
if (power_mode)
|
||||
bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number;
|
||||
|
||||
/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
|
||||
maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
|
||||
if (package->class_count[irq->class]>=maxcount && !power_mode)
|
||||
bonus += 300000;
|
||||
|
||||
/* if the package has no cpus in the allowed mask.. just block */
|
||||
if (!cpus_intersects(irq->allowed_mask, package->mask))
|
||||
bonus += 600000;
|
||||
|
||||
return irq->workload + bonus;
|
||||
}
|
||||
|
||||
static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
|
||||
{
|
||||
int bonus = 0;
|
||||
int dist;
|
||||
|
||||
/* moving to a cold cache gets you a 1500 penalty */
|
||||
if (!cpus_intersects(irq->old_mask, cache_domain->mask))
|
||||
bonus = CROSS_PACKAGE_PENALTY/2;
|
||||
|
||||
/* do a little numa affinity */
|
||||
if (irq->node_num != cache_domain->node_num) {
|
||||
if (irq->node_num >= 0 && cache_domain->node_num >= 0) {
|
||||
dist = numa_distance(irq->node_num, cache_domain->node_num);
|
||||
/* moving to a distant numa node results into penalty */
|
||||
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* but if the irq has had 0 interrupts for a while move it about more easily */
|
||||
if (irq->workload==0)
|
||||
bonus = bonus / 10;
|
||||
|
||||
|
||||
/* pay 6000 for each previous interrupt of the same class */
|
||||
bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
|
||||
|
||||
/* try to avoid having a lot of MSI interrupt (globally, no by devide id) on
|
||||
* cache domain */
|
||||
if (irq->msi == 1)
|
||||
bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
|
||||
|
||||
/* if the cache domain has no cpus in the allowed mask.. just block */
|
||||
if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
|
||||
bonus += 600000;
|
||||
|
||||
return irq->workload + bonus;
|
||||
}
|
||||
|
||||
static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
|
||||
{
|
||||
int bonus = 0;
|
||||
int dist;
|
||||
|
||||
/* moving to a colder core gets you a 1000 penalty */
|
||||
if (!cpus_intersects(irq->old_mask, cpu->mask))
|
||||
bonus = CROSS_PACKAGE_PENALTY/3;
|
||||
|
||||
/* do a little numa affinity */
|
||||
if (irq->node_num != cpu->node_num) {
|
||||
if (irq->node_num >= 0 && cpu->node_num >= 0) {
|
||||
dist = numa_distance(irq->node_num, cpu->node_num);
|
||||
/* moving to a distant numa node results into penalty */
|
||||
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* but if the irq has had 0 interrupts for a while move it about more easily */
|
||||
if (irq->workload==0)
|
||||
bonus = bonus / 10;
|
||||
struct obj_placement *best = (struct obj_placement *)data;
|
||||
uint64_t newload;
|
||||
cpumask_t subset;
|
||||
|
||||
/*
|
||||
* since some chipsets only place at the first cpu, give a tiny preference to non-first
|
||||
* cpus for specifically placed interrupts
|
||||
* If the hint policy is subset, then we only want
|
||||
* to consider objects that are within the irqs hint, but
|
||||
* only if that irq in fact has published a hint
|
||||
*/
|
||||
if (first_cpu(cpu->cache_mask)==cpu->number)
|
||||
bonus++;
|
||||
if (hint_policy == HINT_POLICY_SUBSET) {
|
||||
if (!cpus_empty(best->info->affinity_hint)) {
|
||||
cpus_and(subset, best->info->affinity_hint, d->mask);
|
||||
if (cpus_empty(subset))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* pay 6000 for each previous interrupt of the same class */
|
||||
bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
|
||||
if (d->powersave_mode)
|
||||
return;
|
||||
|
||||
/* if the core has no cpus in the allowed mask.. just block */
|
||||
if (!cpus_intersects(irq->allowed_mask, cpu->mask))
|
||||
bonus += 600000;
|
||||
newload = d->load;
|
||||
if (newload < best->best_cost) {
|
||||
best->best = d;
|
||||
best->best_cost = newload;
|
||||
best->least_irqs = NULL;
|
||||
}
|
||||
|
||||
return irq->workload + bonus;
|
||||
if (newload == best->best_cost) {
|
||||
if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
|
||||
best->least_irqs = d;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void place_cache_domain(struct package *package)
|
||||
static void find_best_object_for_irq(struct irq_info *info, void *data)
|
||||
{
|
||||
GList *iter, *next;
|
||||
GList *pkg;
|
||||
struct interrupt *irq;
|
||||
struct cache_domain *cache_domain;
|
||||
struct obj_placement place;
|
||||
struct topo_obj *d = data;
|
||||
struct topo_obj *asign;
|
||||
|
||||
if (!info->moved)
|
||||
return;
|
||||
|
||||
iter = g_list_first(package->interrupts);
|
||||
while (iter) {
|
||||
struct cache_domain *best = NULL;
|
||||
uint64_t best_cost = INT_MAX;
|
||||
irq = iter->data;
|
||||
switch (d->obj_type) {
|
||||
case OBJ_TYPE_NODE:
|
||||
if (info->level == BALANCE_NONE)
|
||||
return;
|
||||
break;
|
||||
|
||||
if (irq->balance_level <= BALANCE_PACKAGE) {
|
||||
iter = g_list_next(iter);
|
||||
continue;
|
||||
}
|
||||
pkg = g_list_first(package->cache_domains);
|
||||
while (pkg) {
|
||||
uint64_t newload;
|
||||
case OBJ_TYPE_PACKAGE:
|
||||
if (info->level == BALANCE_PACKAGE)
|
||||
return;
|
||||
break;
|
||||
|
||||
cache_domain = pkg->data;
|
||||
newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain);
|
||||
if (newload < best_cost) {
|
||||
best = cache_domain;
|
||||
best_cost = newload;
|
||||
case OBJ_TYPE_CACHE:
|
||||
if (info->level == BALANCE_CACHE)
|
||||
return;
|
||||
break;
|
||||
|
||||
case OBJ_TYPE_CPU:
|
||||
if (info->level == BALANCE_CORE)
|
||||
return;
|
||||
break;
|
||||
}
|
||||
|
||||
pkg = g_list_next(pkg);
|
||||
}
|
||||
if (best) {
|
||||
next = g_list_next(iter);
|
||||
package->interrupts = g_list_delete_link(package->interrupts, iter);
|
||||
place.info = info;
|
||||
place.best = NULL;
|
||||
place.least_irqs = NULL;
|
||||
place.best_cost = INT_MAX;
|
||||
|
||||
best->workload += irq->workload + 1;
|
||||
best->interrupts=g_list_append(best->interrupts, irq);
|
||||
best->class_count[irq->class]++;
|
||||
irq->mask = best->mask;
|
||||
iter = next;
|
||||
} else
|
||||
iter = g_list_next(iter);
|
||||
for_each_object(d->children, find_best_object, &place);
|
||||
|
||||
asign = place.least_irqs ? place.least_irqs : place.best;
|
||||
|
||||
if (asign) {
|
||||
migrate_irq(&d->interrupts, &asign->interrupts, info);
|
||||
info->assigned_obj = asign;
|
||||
asign->load += info->load;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void place_core(struct cache_domain *cache_domain)
|
||||
static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused)))
|
||||
{
|
||||
GList *iter, *next;
|
||||
GList *pkg;
|
||||
struct interrupt *irq;
|
||||
struct cpu_core *cpu;
|
||||
|
||||
|
||||
iter = g_list_first(cache_domain->interrupts);
|
||||
while (iter) {
|
||||
struct cpu_core *best = NULL;
|
||||
uint64_t best_cost = INT_MAX;
|
||||
irq = iter->data;
|
||||
|
||||
/* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */
|
||||
if (irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) {
|
||||
iter = g_list_next(iter);
|
||||
continue;
|
||||
}
|
||||
pkg = g_list_first(cache_domain->cpu_cores);
|
||||
while (pkg) {
|
||||
uint64_t newload;
|
||||
|
||||
cpu = pkg->data;
|
||||
newload = cpu->workload + cpu_cost_func(irq, cpu);
|
||||
if (newload < best_cost) {
|
||||
best = cpu;
|
||||
best_cost = newload;
|
||||
}
|
||||
|
||||
pkg = g_list_next(pkg);
|
||||
}
|
||||
if (best) {
|
||||
next = g_list_next(iter);
|
||||
cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter);
|
||||
|
||||
best->workload += irq->workload + 1;
|
||||
best->interrupts=g_list_append(best->interrupts, irq);
|
||||
best->class_count[irq->class]++;
|
||||
irq->mask = best->mask;
|
||||
iter = next;
|
||||
} else
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
if (g_list_length(d->interrupts) > 0)
|
||||
for_each_irq(d->interrupts, find_best_object_for_irq, d);
|
||||
}
|
||||
|
||||
|
||||
static void place_packages(GList *list)
|
||||
static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
|
||||
{
|
||||
GList *iter;
|
||||
GList *pkg;
|
||||
struct interrupt *irq;
|
||||
struct package *package;
|
||||
struct obj_placement place;
|
||||
struct topo_obj *asign;
|
||||
|
||||
if( info->level == BALANCE_NONE)
|
||||
return;
|
||||
|
||||
iter = g_list_first(list);
|
||||
while (iter) {
|
||||
struct package *best = NULL;
|
||||
uint64_t best_cost = INT_MAX;
|
||||
irq = iter->data;
|
||||
if (irq->balance_level == BALANCE_NONE) {
|
||||
iter = g_list_next(iter);
|
||||
continue;
|
||||
}
|
||||
pkg = g_list_first(packages);
|
||||
while (pkg) {
|
||||
uint64_t newload;
|
||||
|
||||
package = pkg->data;
|
||||
newload = package->workload + package_cost_func(irq, package);
|
||||
if (newload < best_cost) {
|
||||
best = package;
|
||||
best_cost = newload;
|
||||
if (irq_numa_node(info)->number != -1) {
|
||||
/*
|
||||
* This irq belongs to a device with a preferred numa node
|
||||
* put it on that node
|
||||
*/
|
||||
migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info);
|
||||
info->assigned_obj = irq_numa_node(info);
|
||||
irq_numa_node(info)->load += info->load + 1;
|
||||
return;
|
||||
}
|
||||
|
||||
pkg = g_list_next(pkg);
|
||||
}
|
||||
if (best) {
|
||||
best->workload += irq->workload + 1;
|
||||
best->interrupts=g_list_append(best->interrupts, irq);
|
||||
best->class_count[irq->class]++;
|
||||
irq->mask = best->mask;
|
||||
}
|
||||
iter = g_list_next(iter);
|
||||
place.best_cost = INT_MAX;
|
||||
place.best = NULL;
|
||||
place.least_irqs = NULL;
|
||||
place.info = info;
|
||||
|
||||
for_each_object(numa_nodes, find_best_object, &place);
|
||||
|
||||
asign = place.least_irqs ? place.least_irqs : place.best;
|
||||
|
||||
if (asign) {
|
||||
migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
|
||||
info->assigned_obj = asign;
|
||||
asign->load += info->load;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void place_affinity_hint(GList *list)
|
||||
static void validate_irq(struct irq_info *info, void *data)
|
||||
{
|
||||
/* still need to balance best workload within the affinity_hint mask */
|
||||
GList *iter;
|
||||
struct interrupt *irq;
|
||||
|
||||
iter = g_list_first(list);
|
||||
while (iter) {
|
||||
irq = iter->data;
|
||||
if (irq->balance_level == BALANCE_NONE) {
|
||||
iter = g_list_next(iter);
|
||||
continue;
|
||||
}
|
||||
if ((!cpus_empty(irq->node_mask)) &&
|
||||
(!cpus_equal(irq->mask, irq->node_mask)) &&
|
||||
(!__cpus_full(&irq->node_mask, num_possible_cpus()))) {
|
||||
irq->old_mask = irq->mask;
|
||||
irq->mask = irq->node_mask;
|
||||
}
|
||||
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
if (info->assigned_obj != data)
|
||||
printf("object validation error: irq %d is wrong, points to %p, should be %p\n",
|
||||
info->irq, info->assigned_obj, data);
|
||||
}
|
||||
|
||||
|
||||
static void do_unroutables(void)
|
||||
static void validate_object(struct topo_obj *d, void *data __attribute__((unused)))
|
||||
{
|
||||
struct package *package;
|
||||
struct cache_domain *cache_domain;
|
||||
struct cpu_core *cpu;
|
||||
struct interrupt *irq;
|
||||
GList *iter, *inter;
|
||||
|
||||
inter = g_list_first(interrupts);
|
||||
while (inter) {
|
||||
irq = inter->data;
|
||||
inter = g_list_next(inter);
|
||||
if (irq->balance_level != BALANCE_NONE)
|
||||
continue;
|
||||
|
||||
iter = g_list_first(packages);
|
||||
while (iter) {
|
||||
package = iter->data;
|
||||
if (cpus_intersects(package->mask, irq->node_mask) ||
|
||||
cpus_intersects(package->mask, irq->mask))
|
||||
package->workload += irq->workload;
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
|
||||
iter = g_list_first(cache_domains);
|
||||
while (iter) {
|
||||
cache_domain = iter->data;
|
||||
if (cpus_intersects(cache_domain->mask, irq->node_mask)
|
||||
|| cpus_intersects(cache_domain->mask, irq->mask))
|
||||
cache_domain->workload += irq->workload;
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
iter = g_list_first(cpus);
|
||||
while (iter) {
|
||||
cpu = iter->data;
|
||||
if (cpus_intersects(cpu->mask, irq->node_mask) ||
|
||||
cpus_intersects(cpu->mask, irq->mask))
|
||||
cpu->workload += irq->workload;
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
}
|
||||
if (d->interrupts)
|
||||
for_each_irq(d->interrupts, validate_irq, d);
|
||||
}
|
||||
|
||||
static void validate_object_tree_placement()
|
||||
{
|
||||
for_each_object(packages, validate_object, NULL);
|
||||
for_each_object(cache_domains, validate_object, NULL);
|
||||
for_each_object(cpus, validate_object, NULL);
|
||||
}
|
||||
|
||||
void calculate_placement(void)
|
||||
{
|
||||
struct package *package;
|
||||
struct cache_domain *cache_domain;
|
||||
GList *iter;
|
||||
/* first clear old data */
|
||||
clear_work_stats();
|
||||
sort_irq_list();
|
||||
do_unroutables();
|
||||
|
||||
place_packages(interrupts);
|
||||
iter = g_list_first(packages);
|
||||
while (iter) {
|
||||
package = iter->data;
|
||||
place_cache_domain(package);
|
||||
iter = g_list_next(iter);
|
||||
sort_irq_list(&rebalance_irq_list);
|
||||
if (g_list_length(rebalance_irq_list) > 0) {
|
||||
for_each_irq(rebalance_irq_list, place_irq_in_node, NULL);
|
||||
for_each_object(numa_nodes, place_irq_in_object, NULL);
|
||||
for_each_object(packages, place_irq_in_object, NULL);
|
||||
for_each_object(cache_domains, place_irq_in_object, NULL);
|
||||
}
|
||||
|
||||
iter = g_list_first(cache_domains);
|
||||
while (iter) {
|
||||
cache_domain = iter->data;
|
||||
place_core(cache_domain);
|
||||
iter = g_list_next(iter);
|
||||
}
|
||||
/*
|
||||
* if affinity_hint is populated on irq and is not set to
|
||||
* all CPUs (meaning it's initialized), honor that above
|
||||
* anything in the package locality/workload.
|
||||
*/
|
||||
place_affinity_hint(interrupts);
|
||||
if (debug_mode)
|
||||
validate_object_tree_placement();
|
||||
}
|
||||
|
|
47
powermode.c
47
powermode.c
|
@ -28,54 +28,7 @@
|
|||
#include "irqbalance.h"
|
||||
|
||||
|
||||
extern int power_mode;
|
||||
|
||||
static uint64_t previous;
|
||||
|
||||
static unsigned int hysteresis;
|
||||
|
||||
void check_power_mode(void)
|
||||
{
|
||||
FILE *file;
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
char *c;
|
||||
uint64_t dummy __attribute__((unused));
|
||||
uint64_t irq, softirq;
|
||||
file = fopen("/proc/stat", "r");
|
||||
if (!file)
|
||||
return;
|
||||
if (getline(&line, &size, file)==0)
|
||||
size=0;
|
||||
fclose(file);
|
||||
if (!line)
|
||||
return;
|
||||
c=&line[4];
|
||||
dummy = strtoull(c, &c, 10); /* user */
|
||||
dummy = strtoull(c, &c, 10); /* nice */
|
||||
dummy = strtoull(c, &c, 10); /* system */
|
||||
dummy = strtoull(c, &c, 10); /* idle */
|
||||
dummy = strtoull(c, &c, 10); /* iowait */
|
||||
irq = strtoull(c, &c, 10); /* irq */
|
||||
softirq = strtoull(c, &c, 10); /* softirq */
|
||||
|
||||
|
||||
irq += softirq;
|
||||
printf("IRQ delta is %lu \n", (unsigned long)(irq - previous) );
|
||||
if (irq - previous < POWER_MODE_SOFTIRQ_THRESHOLD) {
|
||||
hysteresis++;
|
||||
if (hysteresis > POWER_MODE_HYSTERESIS) {
|
||||
if (debug_mode && !power_mode)
|
||||
printf("IRQ delta is %lu, switching to power mode \n", (unsigned long)(irq - previous) );
|
||||
power_mode = 1;
|
||||
}
|
||||
} else {
|
||||
if (debug_mode && power_mode)
|
||||
printf("IRQ delta is %lu, switching to performance mode \n", (unsigned long)(irq - previous) );
|
||||
power_mode = 0;
|
||||
hysteresis = 0;
|
||||
}
|
||||
previous = irq;
|
||||
free(line);
|
||||
}
|
||||
|
||||
|
|
160
procinterrupts.c
160
procinterrupts.c
|
@ -25,6 +25,7 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <syslog.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "cpumask.h"
|
||||
#include "irqbalance.h"
|
||||
|
@ -39,7 +40,6 @@ void parse_proc_interrupts(void)
|
|||
FILE *file;
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
int int_type;
|
||||
|
||||
file = fopen("/proc/interrupts", "r");
|
||||
if (!file)
|
||||
|
@ -48,6 +48,7 @@ void parse_proc_interrupts(void)
|
|||
/* first line is the header we don't need; nuke it */
|
||||
if (getline(&line, &size, file)==0) {
|
||||
free(line);
|
||||
fclose(file);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -56,6 +57,7 @@ void parse_proc_interrupts(void)
|
|||
int number;
|
||||
uint64_t count;
|
||||
char *c, *c2;
|
||||
struct irq_info *info;
|
||||
|
||||
if (getline(&line, &size, file)==0)
|
||||
break;
|
||||
|
@ -65,7 +67,11 @@ void parse_proc_interrupts(void)
|
|||
proc_int_has_msi = 1;
|
||||
|
||||
/* lines with letters in front are special, like NMI count. Ignore */
|
||||
if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9')))
|
||||
c = line;
|
||||
while (isblank(*(c)))
|
||||
c++;
|
||||
|
||||
if (!(*c>='0' && *c<='9'))
|
||||
break;
|
||||
c = strchr(line, ':');
|
||||
if (!c)
|
||||
|
@ -73,6 +79,10 @@ void parse_proc_interrupts(void)
|
|||
*c = 0;
|
||||
c++;
|
||||
number = strtoul(line, NULL, 10);
|
||||
info = get_irq_info(number);
|
||||
if (!info)
|
||||
info = add_misc_irq(number);
|
||||
|
||||
count = 0;
|
||||
cpunr = 0;
|
||||
|
||||
|
@ -89,17 +99,12 @@ void parse_proc_interrupts(void)
|
|||
if (cpunr != core_count)
|
||||
need_cpu_rescan = 1;
|
||||
|
||||
set_interrupt_count(number, count);
|
||||
info->last_irq_count = info->irq_count;
|
||||
info->irq_count = count;
|
||||
|
||||
/* is interrupt MSI based? */
|
||||
int_type = find_irq_integer_prop(number, IRQ_TYPE);
|
||||
if ((int_type == IRQ_TYPE_MSI) || (int_type == IRQ_TYPE_MSIX)) {
|
||||
if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX))
|
||||
msi_found_in_sysfs = 1;
|
||||
/* Set numa node for irq if it was MSI */
|
||||
if (debug_mode)
|
||||
printf("Set MSI interrupt for %d\n", number);
|
||||
set_msi_interrupt_numa(number);
|
||||
}
|
||||
}
|
||||
if ((proc_int_has_msi) && (!msi_found_in_sysfs)) {
|
||||
syslog(LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n");
|
||||
|
@ -113,3 +118,138 @@ void parse_proc_interrupts(void)
|
|||
fclose(file);
|
||||
free(line);
|
||||
}
|
||||
|
||||
|
||||
static void accumulate_irq_count(struct irq_info *info, void *data)
|
||||
{
|
||||
uint64_t *acc = data;
|
||||
|
||||
*acc += (info->irq_count - info->last_irq_count);
|
||||
}
|
||||
|
||||
static void assign_load_slice(struct irq_info *info, void *data)
|
||||
{
|
||||
uint64_t *load_slice = data;
|
||||
info->load = (info->irq_count - info->last_irq_count) * *load_slice;
|
||||
|
||||
/*
|
||||
* Every IRQ has at least a load of 1
|
||||
*/
|
||||
if (!info->load)
|
||||
info->load++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Recursive helper to estimate the number of irqs shared between
|
||||
* multiple topology objects that was handled by this particular object
|
||||
*/
|
||||
static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d)
|
||||
{
|
||||
uint64_t total_irq_count = 0;
|
||||
|
||||
if (d->parent) {
|
||||
total_irq_count = get_parent_branch_irq_count_share(d->parent);
|
||||
total_irq_count /= g_list_length(*d->obj_type_list);
|
||||
}
|
||||
|
||||
if (g_list_length(d->interrupts) > 0)
|
||||
for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_count);
|
||||
|
||||
return total_irq_count;
|
||||
}
|
||||
|
||||
static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused)))
|
||||
{
|
||||
uint64_t local_irq_counts = 0;
|
||||
uint64_t load_slice;
|
||||
int load_divisor = g_list_length(d->children);
|
||||
|
||||
d->load /= (load_divisor ? load_divisor : 1);
|
||||
|
||||
if (g_list_length(d->interrupts) > 0) {
|
||||
local_irq_counts = get_parent_branch_irq_count_share(d);
|
||||
load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1;
|
||||
for_each_irq(d->interrupts, assign_load_slice, &load_slice);
|
||||
}
|
||||
|
||||
if (d->parent)
|
||||
d->parent->load += d->load;
|
||||
}
|
||||
|
||||
void parse_proc_stat()
|
||||
{
|
||||
FILE *file;
|
||||
char *line = NULL;
|
||||
size_t size = 0;
|
||||
int cpunr, rc, cpucount;
|
||||
struct topo_obj *cpu;
|
||||
int irq_load, softirq_load;
|
||||
|
||||
file = fopen("/proc/stat", "r");
|
||||
if (!file) {
|
||||
syslog(LOG_WARNING, "WARNING cant open /proc/stat. balacing is broken\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* first line is the header we don't need; nuke it */
|
||||
if (getline(&line, &size, file)==0) {
|
||||
free(line);
|
||||
syslog(LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n");
|
||||
fclose(file);
|
||||
return;
|
||||
}
|
||||
|
||||
cpucount = 0;
|
||||
while (!feof(file)) {
|
||||
if (getline(&line, &size, file)==0)
|
||||
break;
|
||||
|
||||
if (!strstr(line, "cpu"))
|
||||
break;
|
||||
|
||||
cpunr = strtoul(&line[3], NULL, 10);
|
||||
|
||||
rc = sscanf(line, "%*s %*d %*d %*d %*d %*d %d %d", &irq_load, &softirq_load);
|
||||
if (rc < 2)
|
||||
break;
|
||||
|
||||
cpu = find_cpu_core(cpunr);
|
||||
|
||||
if (!cpu)
|
||||
break;
|
||||
|
||||
cpucount++;
|
||||
|
||||
/*
|
||||
* For each cpu add the irq and softirq load and propagate that
|
||||
* all the way up the device tree
|
||||
*/
|
||||
if (cycle_count) {
|
||||
cpu->load = (irq_load + softirq_load) - (cpu->last_load);
|
||||
/*
|
||||
* the [soft]irq_load values are in jiffies, which are
|
||||
* units of 10ms, multiply by 1000 to convert that to
|
||||
* 1/10 milliseconds. This give us a better integer
|
||||
* distribution of load between irqs
|
||||
*/
|
||||
cpu->load *= 1000;
|
||||
}
|
||||
cpu->last_load = (irq_load + softirq_load);
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
if (cpucount != get_cpu_count()) {
|
||||
syslog(LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now that we have load for each cpu attribute a fair share of the load
|
||||
* to each irq on that cpu
|
||||
*/
|
||||
for_each_object(cpus, compute_irq_branch_load_share, NULL);
|
||||
for_each_object(cache_domains, compute_irq_branch_load_share, NULL);
|
||||
for_each_object(packages, compute_irq_branch_load_share, NULL);
|
||||
for_each_object(numa_nodes, compute_irq_branch_load_share, NULL);
|
||||
|
||||
}
|
||||
|
|
100
types.h
100
types.h
|
@ -26,89 +26,39 @@
|
|||
#define IRQ_TYPE_MSI 1
|
||||
#define IRQ_TYPE_MSIX 2
|
||||
|
||||
|
||||
/*
|
||||
* IRQ properties
|
||||
*/
|
||||
enum irq_prop {
|
||||
IRQ_CLASS = 0,
|
||||
IRQ_TYPE,
|
||||
IRQ_NUMA,
|
||||
IRQ_LCPU_MASK,
|
||||
IRQ_MAX_PROPERTY
|
||||
enum obj_type_e {
|
||||
OBJ_TYPE_CPU,
|
||||
OBJ_TYPE_CACHE,
|
||||
OBJ_TYPE_PACKAGE,
|
||||
OBJ_TYPE_NODE
|
||||
};
|
||||
|
||||
struct package {
|
||||
uint64_t workload;
|
||||
struct topo_obj {
|
||||
uint64_t load;
|
||||
uint64_t last_load;
|
||||
enum obj_type_e obj_type;
|
||||
int number;
|
||||
|
||||
int powersave_mode;
|
||||
cpumask_t mask;
|
||||
int node_num;
|
||||
|
||||
int class_count[7];
|
||||
|
||||
GList *cache_domains;
|
||||
GList *interrupts;
|
||||
struct topo_obj *parent;
|
||||
GList *children;
|
||||
GList **obj_type_list;
|
||||
};
|
||||
|
||||
struct cache_domain {
|
||||
uint64_t workload;
|
||||
int number;
|
||||
|
||||
int marker;
|
||||
int node_num;
|
||||
|
||||
cpumask_t mask;
|
||||
|
||||
cpumask_t package_mask;
|
||||
|
||||
int class_count[7];
|
||||
|
||||
GList *cpu_cores;
|
||||
GList *interrupts;
|
||||
};
|
||||
|
||||
|
||||
struct cpu_core {
|
||||
uint64_t workload;
|
||||
int number;
|
||||
|
||||
int marker;
|
||||
int node_num;
|
||||
|
||||
int class_count[7];
|
||||
|
||||
cpumask_t package_mask;
|
||||
cpumask_t cache_mask;
|
||||
cpumask_t mask;
|
||||
|
||||
GList *interrupts;
|
||||
};
|
||||
|
||||
struct interrupt {
|
||||
uint64_t workload;
|
||||
|
||||
int balance_level;
|
||||
|
||||
int number;
|
||||
struct irq_info {
|
||||
int irq;
|
||||
int class;
|
||||
int node_num;
|
||||
int msi;
|
||||
|
||||
uint64_t count;
|
||||
uint64_t old_count;
|
||||
uint64_t extra;
|
||||
|
||||
cpumask_t mask;
|
||||
cpumask_t old_mask;
|
||||
|
||||
|
||||
cpumask_t numa_mask;
|
||||
cpumask_t allowed_mask;
|
||||
|
||||
/* user/driver provided for smarter balancing */
|
||||
cpumask_t node_mask;
|
||||
int type;
|
||||
int level;
|
||||
struct topo_obj *numa_node;
|
||||
cpumask_t cpumask;
|
||||
cpumask_t affinity_hint;
|
||||
uint64_t irq_count;
|
||||
uint64_t last_irq_count;
|
||||
uint64_t load;
|
||||
int moved;
|
||||
struct topo_obj *assigned_obj;
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in a new issue