Merge branch 'new_balancer'

Conflicts:
	classify.c
Neil Horman 2011-10-12 13:27:10 -04:00
commit 430c88e502
14 changed files with 1081 additions and 1072 deletions


@@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies
EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4
INCLUDES = -I${top_srcdir}
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma -lm
AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE
noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \
types.h


@@ -32,30 +32,40 @@
#include "irqbalance.h"
void activate_mapping(void)
static void activate_mapping(struct irq_info *info, void *data __attribute__((unused)))
{
struct interrupt *irq;
GList *iter;
char buf[PATH_MAX];
FILE *file;
cpumask_t applied_mask;
iter = g_list_first(interrupts);
while (iter) {
irq = iter->data;
iter = g_list_next(iter);
/*
* only activate mappings for irqs that have moved
*/
if (!info->moved)
return;
/* don't set the level if it's a NONE irq, or if there is
* no change */
if (irq->balance_level != BALANCE_NONE &&
!cpus_equal(irq->mask, irq->old_mask)) {
char buf[PATH_MAX];
FILE *file;
sprintf(buf, "/proc/irq/%i/smp_affinity", irq->number);
file = fopen(buf, "w");
if (!file)
continue;
cpumask_scnprintf(buf, PATH_MAX, irq->mask);
fprintf(file,"%s", buf);
fclose(file);
irq->old_mask = irq->mask;
}
}
if (!info->assigned_obj)
return;
sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq);
file = fopen(buf, "w");
if (!file)
return;
if ((hint_policy == HINT_POLICY_EXACT) &&
(!cpus_empty(info->affinity_hint)))
applied_mask = info->affinity_hint;
else
applied_mask = info->assigned_obj->mask;
cpumask_scnprintf(buf, PATH_MAX, applied_mask);
fprintf(file, "%s", buf);
fclose(file);
info->moved = 0; /*migration is done*/
}
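/*
 * A concrete sketch of the effect (assuming irq 30 is assigned to an
 * object whose mask covers cpus 0-1 and no exact hint applies): the
 * code above amounts to
 *
 *	echo 3 > /proc/irq/30/smp_affinity
 *
 * where "3" is the hex cpumask string built by cpumask_scnprintf().
 */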
void activate_mappings(void)
{
for_each_irq(NULL, activate_mapping, NULL);
}


@@ -25,8 +25,6 @@ int map_class_to_level[7] =
{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CACHE, BALANCE_NONE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };
int class_counts[7];
#define MAX_CLASS 0x12
/*
* Class codes lifted from pci spec, appendix D.
@@ -56,35 +54,6 @@ static short class_codes[MAX_CLASS] = {
static GList *interrupts_db;
#define SYSDEV_DIR "/sys/bus/pci/devices"
union property {
int int_val;
cpumask_t mask_val;
};
enum irq_type {
INT_TYPE = 0,
CPUMASK_TYPE,
};
struct irq_property {
enum irq_type itype;
union property iproperty;
};
#define iint_val iproperty.int_val
#define imask_val iproperty.mask_val
struct irq_info {
int irq;
struct irq_property property[IRQ_MAX_PROPERTY];
};
static void init_new_irq(struct irq_info *new)
{
new->property[IRQ_CLASS].itype = INT_TYPE;
new->property[IRQ_TYPE].itype = INT_TYPE;
new->property[IRQ_NUMA].itype = INT_TYPE;
new->property[IRQ_LCPU_MASK].itype = CPUMASK_TYPE;
}
static gint compare_ints(gconstpointer a, gconstpointer b)
{
@@ -94,11 +63,6 @@ static gint compare_ints(gconstpointer a, gconstpointer b)
return ai->irq - bi->irq;
}
static void free_int(gpointer data)
{
free(data);
}
/*
* Inserts an irq_info struct into the interrupts_db list
* devpath points to the device directory in sysfs for the
@@ -126,13 +90,12 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
return NULL;
}
new = malloc(sizeof(struct irq_info));
new = calloc(sizeof(struct irq_info), 1);
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
new->class = IRQ_OTHER;
interrupts_db = g_list_append(interrupts_db, new);
@@ -159,7 +122,9 @@ static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
if (class >= MAX_CLASS)
goto get_numa_node;
new->property[IRQ_CLASS].iint_val = class_codes[class];
new->class = class_codes[class];
new->level = map_class_to_level[class_codes[class]];
get_numa_node:
numa_node = -1;
sprintf(path, "%s/numa_node", devpath);
@@ -171,24 +136,39 @@ get_numa_node:
fclose(fd);
assign_node:
new->property[IRQ_NUMA].iint_val = numa_node;
new->numa_node = get_numa_node(numa_node);
sprintf(path, "%s/local_cpus", devpath);
fd = fopen(path, "r");
if (!fd) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
goto out;
cpus_setall(new->cpumask);
goto assign_affinity_hint;
}
lcpu_mask = NULL;
rc = fscanf(fd, "%as", &lcpu_mask);
fclose(fd);
if (!lcpu_mask) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
if (!lcpu_mask || !rc) {
cpus_setall(new->cpumask);
} else {
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
new->property[IRQ_LCPU_MASK].imask_val);
free(lcpu_mask);
new->cpumask);
}
free(lcpu_mask);
assign_affinity_hint:
cpus_clear(new->affinity_hint);
sprintf(path, "/proc/irq/%d/affinity_hint", irq);
fd = fopen(path, "r");
if (!fd)
goto out;
lcpu_mask = NULL;
rc = fscanf(fd, "%as", &lcpu_mask);
fclose(fd);
if (!lcpu_mask)
goto out;
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
new->affinity_hint);
free(lcpu_mask);
out:
if (debug_mode)
printf("Adding IRQ %d to database\n", irq);
@@ -226,7 +206,7 @@ static void build_one_dev_entry(const char *dirname)
new = add_one_irq_to_db(path, irqnum);
if (!new)
continue;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX;
new->type = IRQ_TYPE_MSIX;
}
} while (entry != NULL);
closedir(msidir);
@@ -248,20 +228,32 @@ static void build_one_dev_entry(const char *dirname)
new = add_one_irq_to_db(path, irqnum);
if (!new)
goto done;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
new->type = IRQ_TYPE_LEGACY;
}
done:
fclose(fd);
return;
}
static void free_irq(struct irq_info *info, void *data __attribute__((unused)))
{
free(info);
}
void free_irq_db(void)
{
for_each_irq(NULL, free_irq, NULL);
g_list_free(interrupts_db);
interrupts_db = NULL;
}
void rebuild_irq_db(void)
{
DIR *devdir = opendir(SYSDEV_DIR);
struct dirent *entry;
g_list_free_full(interrupts_db, free_int);
free_irq_db();
if (!devdir)
return;
@@ -278,83 +270,80 @@ void rebuild_irq_db(void)
closedir(devdir);
}
static GList *add_misc_irq(int irq)
struct irq_info *add_misc_irq(int irq)
{
struct irq_info *new, find;
struct irq_info *new;
new = malloc(sizeof(struct irq_info));
new = calloc(sizeof(struct irq_info), 1);
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
new->property[IRQ_NUMA].iint_val = -1;
new->type = IRQ_TYPE_LEGACY;
new->class = IRQ_OTHER;
new->numa_node = get_numa_node(0);
interrupts_db = g_list_append(interrupts_db, new);
find.irq = irq;
return g_list_find_custom(interrupts_db, &find, compare_ints);
return new;
}
int find_irq_integer_prop(int irq, enum irq_prop prop)
void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data)
{
GList *entry;
struct irq_info find, *result;
find.irq = irq;
GList *entry = g_list_first(list ? list : interrupts_db);
GList *next;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry) {
if (debug_mode)
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
entry = add_misc_irq(irq);
while (entry) {
next = g_list_next(entry);
cb(entry->data, data);
entry = next;
}
result = entry->data;
assert(result->property[prop].itype == INT_TYPE);
return result->property[prop].iint_val;
}
cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop)
struct irq_info *get_irq_info(int irq)
{
GList *entry;
struct irq_info find, *result;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry) {
if (debug_mode)
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
entry = add_misc_irq(irq);
}
result = entry->data;
assert(result->property[prop].itype == CPUMASK_TYPE);
return result->property[prop].imask_val;
}
int get_next_irq(int irq)
{
GList *entry;
struct irq_info *irqp, find;
if (irq == -1) {
entry = g_list_first(interrupts_db);
irqp = entry->data;
return irqp->irq;
}
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry)
return entry ? entry->data : NULL;
}
void migrate_irq(GList **from, GList **to, struct irq_info *info)
{
GList *entry;
struct irq_info find, *tmp;
find.irq = info->irq;
entry = g_list_find_custom(*from, &find, compare_ints);
tmp = entry->data;
*from = g_list_delete_link(*from, entry);
*to = g_list_append(*to, tmp);
info->moved = 1;
}
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
struct irq_info *a, *b;
a = (struct irq_info*)A;
b = (struct irq_info*)B;
if (a->class < b->class)
return 1;
if (a->class > b->class)
return -1;
entry = g_list_next(entry);
if (!entry)
if (a->load < b->load)
return 1;
if (a->load > b->load)
return -1;
irqp= entry->data;
return irqp->irq;
if (a<b)
return 1;
return -1;
}
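/*
 * i.e. a descending sort: higher class first, then higher load within a
 * class; the final pointer comparison just makes the ordering
 * deterministic when both fields are equal.
 */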
void sort_irq_list(GList **list)
{
*list = g_list_sort(*list, sort_irqs);
}


@@ -51,10 +51,11 @@ AC_PROG_AWK
echo .
echo Checking for header files
AC_HEADER_STDC
AC_CHECK_HEADERS(linux/ethtool.h linux/sockios.h, [], [])
AC_CHECK_FUNCS(getopt_long)
AC_CHECK_LIB(numa, numa_run_on_node, [], [])
AC_CHECK_LIB(m, floor, [], [])
AC_C_CONST
AC_C_INLINE

cputree.c

@@ -55,122 +55,103 @@ cpumask_t cpu_possible_map;
*/
static cpumask_t unbanned_cpus;
static int search_numa_node(cpumask_t mask)
{
int node_num, ret;
struct bitmask *node_mask;
cpumask_t cpu_node_mask;
node_num = numa_num_configured_nodes();
if (node_num < 1)
return -1;
node_mask = numa_allocate_cpumask();
node_num--; /* indexing from zero */
while (node_num >= 0) {
ret = numa_node_to_cpus(node_num, node_mask);
if (ret) {
node_num--;
continue;
}
memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long));
if (cpus_intersects(mask, cpu_node_mask)) {
numa_free_cpumask(node_mask);
return node_num;
}
node_num--;
}
numa_free_cpumask(node_mask);
return node_num;
}
static void fill_packages(void)
static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
cpumask_t package_mask)
{
GList *entry;
struct topo_obj *package;
struct topo_obj *lcache;
entry = g_list_first(packages);
entry = g_list_first(cache_domains);
while (entry) {
struct package *package;
struct cache_domain *cache = NULL;
GList *entry2;
cache = entry->data;
entry2 = entry;
entry = g_list_next(entry);
if (cache->marker)
continue;
package = malloc(sizeof(struct package));
if (!package)
package = entry->data;
if (cpus_equal(package_mask, package->mask))
break;
memset(package, 0, sizeof(struct package));
package->mask = cache->package_mask;
package->number = cache->number;
package->node_num = search_numa_node(package->mask);
while (entry2) {
struct cache_domain *cache2;
cache2 = entry2->data;
if (cpus_equal(cache->package_mask, cache2->package_mask)) {
cache2->marker = 1;
package->cache_domains = g_list_append(package->cache_domains, cache2);
if (package->number > cache2->number)
package->number = cache2->number;
}
entry2 = g_list_next(entry2);
}
entry = g_list_next(entry);
}
if (!entry) {
package = calloc(sizeof(struct topo_obj), 1);
if (!package)
return NULL;
package->mask = package_mask;
package->obj_type = OBJ_TYPE_PACKAGE;
package->obj_type_list = &packages;
packages = g_list_append(packages, package);
package_count++;
}
}
static void fill_cache_domain(void)
entry = g_list_first(package->children);
while (entry) {
lcache = entry->data;
if (lcache == cache)
break;
entry = g_list_next(entry);
}
if (!entry) {
package->children = g_list_append(package->children, cache);
cache->parent = package;
}
return package;
}
static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
cpumask_t cache_mask)
{
GList *entry;
struct topo_obj *cache;
struct topo_obj *lcpu;
entry = g_list_first(cache_domains);
entry = g_list_first(cpus);
while (entry) {
struct cache_domain *cache = NULL;
struct cpu_core *cpu;
GList *entry2;
cpu = entry->data;
entry2 = entry;
entry = g_list_next(entry);
if (cpu->marker)
continue;
cache = malloc(sizeof(struct cache_domain));
if (!cache)
cache = entry->data;
if (cpus_equal(cache_mask, cache->mask))
break;
memset(cache, 0, sizeof(struct cache_domain));
cache->mask = cpu->cache_mask;
cache->package_mask = cpu->package_mask;
cache->number = cpu->number;
cache->node_num = search_numa_node(cache->mask);
entry = g_list_next(entry);
}
if (!entry) {
cache = calloc(sizeof(struct topo_obj), 1);
if (!cache)
return NULL;
cache->obj_type = OBJ_TYPE_CACHE;
cache->mask = cache_mask;
cache->number = cache_domain_count;
cache->obj_type_list = &cache_domains;
cache_domains = g_list_append(cache_domains, cache);
cache_domain_count++;
while (entry2) {
struct cpu_core *cpu2;
cpu2 = entry2->data;
if (cpus_equal(cpu->cache_mask, cpu2->cache_mask) &&
cpus_equal(cpu->package_mask, cpu2->package_mask)) {
cpu2->marker = 1;
cache->cpu_cores = g_list_append(cache->cpu_cores, cpu2);
if (cpu2->number < cache->number)
cache->number = cpu2->number;
}
entry2 = g_list_next(entry2);
}
}
entry = g_list_first(cache->children);
while (entry) {
lcpu = entry->data;
if (lcpu == cpu)
break;
entry = g_list_next(entry);
}
if (!entry) {
cache->children = g_list_append(cache->children, cpu);
cpu->parent = (struct topo_obj *)cache;
}
return cache;
}
static void do_one_cpu(char *path)
{
struct cpu_core *cpu;
struct topo_obj *cpu;
FILE *file;
char new_path[PATH_MAX];
cpumask_t cache_mask, package_mask;
struct topo_obj *cache;
struct topo_obj *package;
DIR *dir;
struct dirent *entry;
int nodeid;
/* skip offline cpus */
snprintf(new_path, PATH_MAX, "%s/online", path);
@@ -188,10 +169,11 @@ static void do_one_cpu(char *path)
free(line);
}
cpu = malloc(sizeof(struct cpu_core));
cpu = calloc(sizeof(struct topo_obj), 1);
if (!cpu)
return;
memset(cpu, 0, sizeof(struct cpu_core));
cpu->obj_type = OBJ_TYPE_CPU;
cpu->number = strtoul(&path[27], NULL, 10);
@@ -199,9 +181,6 @@ static void do_one_cpu(char *path)
cpu_set(cpu->number, cpu->mask);
/* set numa node of cpu */
cpu->node_num = search_numa_node(cpu->mask);
/* if the cpu is on the banned list, just don't add it */
if (cpus_intersects(cpu->mask, banned_cpus)) {
free(cpu);
@@ -214,26 +193,26 @@ static void do_one_cpu(char *path)
/* try to read the package mask; if it doesn't exist assume solitary */
snprintf(new_path, PATH_MAX, "%s/topology/core_siblings", path);
file = fopen(new_path, "r");
cpu_set(cpu->number, cpu->package_mask);
cpu_set(cpu->number, package_mask);
if (file) {
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->package_mask);
cpumask_parse_user(line, strlen(line), package_mask);
fclose(file);
free(line);
}
/* try to read the cache mask; if it doesn't exist assume solitary */
/* We want the deepest cache level available so try index1 first, then index2 */
cpu_set(cpu->number, cpu->cache_mask);
cpu_set(cpu->number, cache_mask);
snprintf(new_path, PATH_MAX, "%s/cache/index1/shared_cpu_map", path);
file = fopen(new_path, "r");
if (file) {
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
cpumask_parse_user(line, strlen(line), cache_mask);
fclose(file);
free(line);
}
@@ -243,66 +222,94 @@ static void do_one_cpu(char *path)
char *line = NULL;
size_t size = 0;
if (getline(&line, &size, file))
cpumask_parse_user(line, strlen(line), cpu->cache_mask);
cpumask_parse_user(line, strlen(line), cache_mask);
fclose(file);
free(line);
}
nodeid=0;
dir = opendir(path);
do {
entry = readdir(dir);
if (!entry)
break;
if (strstr(entry->d_name, "node")) {
nodeid = strtoul(&entry->d_name[4], NULL, 10);
break;
}
} while (entry);
closedir(dir);
cache = add_cpu_to_cache_domain(cpu, cache_mask);
package = add_cache_domain_to_package(cache, package_mask);
add_package_to_node(package, nodeid);
/*
blank out the banned cpus from the various masks so that interrupts
will never be told to go there
*/
cpus_and(cpu->cache_mask, cpu->cache_mask, unbanned_cpus);
cpus_and(cpu->package_mask, cpu->package_mask, unbanned_cpus);
cpus_and(cpu_cache_domain(cpu)->mask, cpu_cache_domain(cpu)->mask, unbanned_cpus);
cpus_and(cpu_package(cpu)->mask, cpu_package(cpu)->mask, unbanned_cpus);
cpus_and(cpu->mask, cpu->mask, unbanned_cpus);
cpu->obj_type_list = &cpus;
cpus = g_list_append(cpus, cpu);
core_count++;
}
static void dump_irqs(int spaces, GList *dump_interrupts)
static void dump_irq(struct irq_info *info, void *data)
{
struct interrupt *irq;
while (dump_interrupts) {
int i;
for (i=0; i<spaces; i++) printf(" ");
irq = dump_interrupts->data;
printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload);
dump_interrupts = g_list_next(dump_interrupts);
}
int spaces = (long int)data;
int i;
for (i=0; i<spaces; i++) printf(" ");
printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned int)info->load);
}
static void dump_topo_obj(struct topo_obj *d, void *data __attribute__((unused)))
{
struct topo_obj *c = (struct topo_obj *)d;
printf(" CPU number %i numa_node is %d (load %lu)\n", c->number, cpu_numa_node(c)->number , (unsigned long)c->load);
if (c->interrupts)
for_each_irq(c->interrupts, dump_irq, (void *)18);
}
static void dump_cache_domain(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4095, d->mask);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", d->number, cache_domain_numa_node(d)->number, buffer, (unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_topo_obj, NULL);
if (d->interrupts)
for_each_irq(d->interrupts, dump_irq, (void *)10);
}
static void dump_package(struct topo_obj *d, void *data)
{
char *buffer = data;
cpumask_scnprintf(buffer, 4096, d->mask);
printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", d->number, package_numa_node(d)->number, buffer, (unsigned long)d->load);
if (d->children)
for_each_object(d->children, dump_cache_domain, buffer);
if (d->interrupts)
for_each_irq(d->interrupts, dump_irq, (void *)2);
}
void dump_tree(void)
{
GList *p_iter, *c_iter, *cp_iter;
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
char buffer[4096];
p_iter = g_list_first(packages);
while (p_iter) {
package = p_iter->data;
cpumask_scnprintf(buffer, 4096, package->mask);
printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload);
c_iter = g_list_first(package->cache_domains);
while (c_iter) {
cache_domain = c_iter->data;
c_iter = g_list_next(c_iter);
cpumask_scnprintf(buffer, 4095, cache_domain->mask);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload);
cp_iter = cache_domain->cpu_cores;
while (cp_iter) {
cpu = cp_iter->data;
cp_iter = g_list_next(cp_iter);
printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload);
dump_irqs(18, cpu->interrupts);
}
dump_irqs(10, cache_domain->interrupts);
}
dump_irqs(2, package->interrupts);
p_iter = g_list_next(p_iter);
}
for_each_object(packages, dump_package, buffer);
}
static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
{
info->load = 0;
}
static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused)))
{
for_each_object(d->children, clear_obj_stats, NULL);
for_each_irq(d->interrupts, clear_irq_stats, NULL);
}
/*
@@ -310,40 +317,9 @@ void dump_tree(void)
* which level does how much work and the actual lists of interrupts
* assigned to each component
*/
void clear_work_stats(void)
void clear_work_stats()
{
GList *p_iter, *c_iter, *cp_iter;
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
p_iter = g_list_first(packages);
while (p_iter) {
package = p_iter->data;
package->workload = 0;
g_list_free(package->interrupts);
package->interrupts = NULL;
c_iter = g_list_first(package->cache_domains);
memset(package->class_count, 0, sizeof(package->class_count));
while (c_iter) {
cache_domain = c_iter->data;
c_iter = g_list_next(c_iter);
cache_domain->workload = 0;
cp_iter = cache_domain->cpu_cores;
g_list_free(cache_domain->interrupts);
cache_domain->interrupts = NULL;
memset(cache_domain->class_count, 0, sizeof(cache_domain->class_count));
while (cp_iter) {
cpu = cp_iter->data;
cp_iter = g_list_next(cp_iter);
cpu->workload = 0;
g_list_free(cpu->interrupts);
cpu->interrupts = NULL;
memset(cpu->class_count, 0, sizeof(cpu->class_count));
}
}
p_iter = g_list_next(p_iter);
}
for_each_object(numa_nodes, clear_obj_stats, NULL);
}
@@ -373,9 +349,6 @@ void parse_cpu_tree(void)
} while (entry);
closedir(dir);
fill_cache_domain();
fill_packages();
if (debug_mode)
dump_tree();
@@ -389,14 +362,14 @@ void parse_cpu_tree(void)
void clear_cpu_tree(void)
{
GList *item;
struct cpu_core *cpu;
struct cache_domain *cache_domain;
struct package *package;
struct topo_obj *cpu;
struct topo_obj *cache_domain;
struct topo_obj *package;
while (packages) {
item = g_list_first(packages);
package = item->data;
g_list_free(package->cache_domains);
g_list_free(package->children);
g_list_free(package->interrupts);
free(package);
packages = g_list_delete_link(packages, item);
@@ -406,7 +379,7 @@ void clear_cpu_tree(void)
while (cache_domains) {
item = g_list_first(cache_domains);
cache_domain = item->data;
g_list_free(cache_domain->cpu_cores);
g_list_free(cache_domain->children);
g_list_free(cache_domain->interrupts);
free(cache_domain);
cache_domains = g_list_delete_link(cache_domains, item);
@@ -424,3 +397,28 @@ void clear_cpu_tree(void)
core_count = 0;
}
static gint compare_cpus(gconstpointer a, gconstpointer b)
{
const struct topo_obj *ai = a;
const struct topo_obj *bi = b;
return ai->number - bi->number;
}
struct topo_obj *find_cpu_core(int cpunr)
{
GList *entry;
struct topo_obj find;
find.number = cpunr;
entry = g_list_find_custom(cpus, &find, compare_cpus);
return entry ? entry->data : NULL;
}
int get_cpu_count(void)
{
return g_list_length(cpus);
}


@@ -41,6 +41,19 @@ Causes irqbalance to be run once, after which the daemon exits
.B --debug
Causes irqbalance to run in the foreground and extra debug information to be printed
.TP
.B --hintpolicy=[exact | subset | ignore]
Set the policy for how irq kernel affinity hinting is treated. Can be one of:
.P
.I exact
irq affinity hint is applied unilaterally and never violated
.P
.I subset
irq is balanced, but the assigned object will be a subset of the affinity hint
.P
.I ignore
irq affinity hint value is completely ignored
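.P
For example, to run in the foreground and balance while keeping each irq
within its driver-supplied affinity hint:
.P
irqbalance --debug --hintpolicy=subset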
.SH "ENVIRONMENT VARIABLES"
.TP
.B IRQBALANCE_ONESHOT


@@ -38,13 +38,11 @@
int one_shot_mode;
int debug_mode;
int numa_avail;
int need_cpu_rescan;
extern cpumask_t banned_cpus;
static int counter;
enum hp_e hint_policy = HINT_POLICY_SUBSET;
unsigned long power_thresh = ULONG_MAX;
unsigned long long cycle_count = 0;
void sleep_approx(int seconds)
{
@@ -64,12 +62,15 @@ void sleep_approx(int seconds)
struct option lopts[] = {
{"oneshot", 0, NULL, 'o'},
{"debug", 0, NULL, 'd'},
{"hintpolicy", 1, NULL, 'h'},
{"powerthresh", 1, NULL, 'p'},
{0, 0, 0, 0}
};
static void usage(void)
{
printf("irqbalance [--oneshot | -o] [--debug | -d]");
printf("irqbalance [--oneshot | -o] [--debug | -d] [--hintpolicy= | -h [exact|subset|ignore]]\n");
printf(" [--powerthresh= | -p <off> | <n>]\n");
}
static void parse_command_line(int argc, char **argv)
@@ -78,7 +79,7 @@ static void parse_command_line(int argc, char **argv)
int longind;
while ((opt = getopt_long(argc, argv,
"",
"odh:p:",
lopts, &longind)) != -1) {
switch(opt) {
@@ -88,6 +89,29 @@ static void parse_command_line(int argc, char **argv)
case 'd':
debug_mode=1;
break;
case 'h':
if (!strncmp(optarg, "exact", strlen(optarg)))
hint_policy = HINT_POLICY_EXACT;
else if (!strncmp(optarg, "subset", strlen(optarg)))
hint_policy = HINT_POLICY_SUBSET;
else if (!strncmp(optarg, "ignore", strlen(optarg)))
hint_policy = HINT_POLICY_IGNORE;
else {
usage();
exit(1);
}
break;
case 'p':
if (!strncmp(optarg, "off", strlen(optarg)))
power_thresh = ULONG_MAX;
else {
power_thresh = strtoull(optarg, NULL, 10);
if (power_thresh == ULONG_MAX) {
usage();
exit(1);
}
}
break;
case 'o':
one_shot_mode=1;
break;
@@ -96,6 +120,50 @@ static void parse_command_line(int argc, char **argv)
}
#endif
/*
* This builds our object tree. The hierarchy is pretty straightforward:
* At the top are numa_nodes
* All CPU packages belong to a single numa_node
* All Cache domains belong to a CPU package
* All CPU cores belong to a cache domain
*
* Objects are built in that order (top down)
*
* Object workload is the aggregate sum of the
* workload of the objects below it
*/
static void build_object_tree()
{
build_numa_node_list();
parse_cpu_tree();
rebuild_irq_db();
}
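/*
 * A minimal sketch of the resulting containment (assuming a single node
 * with one package, one cache domain and two cpus):
 *
 *	numa_nodes -> node0
 *	                `-- package0          (node0->children)
 *	                      `-- cache0      (package0->children)
 *	                            |-- cpu0  (cache0->children)
 *	                            `-- cpu1
 *
 * Each object points back at its container through ->parent, which is
 * what the cpu_package()/cpu_numa_node() style macros in irqbalance.h
 * walk.
 */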
static void free_object_tree()
{
free_numa_node_list();
clear_cpu_tree();
free_irq_db();
}
static void dump_object_tree()
{
for_each_object(numa_nodes, dump_numa_node_info, NULL);
}
static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
{
if (info->level == BALANCE_NONE)
return;
if (info->assigned_obj == NULL)
rebalance_irq_list = g_list_append(rebalance_irq_list, info);
else
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
info->assigned_obj = NULL;
}
int main(int argc, char** argv)
{
@@ -125,9 +193,9 @@ int main(int argc, char** argv)
}
rebuild_irq_db();
parse_cpu_tree();
build_object_tree();
if (debug_mode)
dump_object_tree();
/* On single core UP systems irqbalance obviously has no work to do */
@@ -150,15 +218,10 @@ int main(int argc, char** argv)
capng_apply(CAPNG_SELECT_BOTH);
#endif
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
sleep(SLEEP_INTERVAL/4);
reset_counts();
parse_proc_interrupts();
pci_numa_scan();
calculate_workload();
sort_irq_list();
if (debug_mode)
dump_workloads();
parse_proc_stat();
while (1) {
sleep_approx(SLEEP_INTERVAL);
@@ -166,8 +229,9 @@ int main(int argc, char** argv)
printf("\n\n\n-----------------------------------------------------------------------------\n");
check_power_mode();
clear_work_stats();
parse_proc_interrupts();
parse_proc_stat();
/* cope with cpu hotplug -- detected during /proc/interrupts parsing */
if (need_cpu_rescan) {
@@ -179,25 +243,31 @@ int main(int argc, char** argv)
reset_counts();
clear_work_stats();
clear_cpu_tree();
parse_cpu_tree();
}
free_object_tree();
build_object_tree();
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
parse_proc_stat();
sleep_approx(SLEEP_INTERVAL);
clear_work_stats();
parse_proc_interrupts();
parse_proc_stat();
cycle_count=0;
}
calculate_workload();
/* to cope with dynamic configurations we scan for new numa information
* once every 5 minutes
*/
pci_numa_scan();
if (cycle_count)
update_migration_status();
calculate_placement();
activate_mapping();
activate_mappings();
if (debug_mode)
dump_tree();
if (one_shot_mode)
break;
counter++;
cycle_count++;
}
free_object_tree();
return EXIT_SUCCESS;
}


@@ -8,47 +8,116 @@
#include <stdint.h>
#include <glib.h>
#include <syslog.h>
#include "types.h"
#include <numa.h>
struct interrupt;
extern int package_count;
extern int cache_domain_count;
extern int core_count;
extern char *classes[];
extern int map_class_to_level[7];
extern int class_counts[7];
extern int debug_mode;
extern int power_mode;
extern int need_cpu_rescan;
extern int one_shot_mode;
extern GList *interrupts;
extern void parse_cpu_tree(void);
extern void clear_work_stats(void);
extern void parse_proc_interrupts(void);
extern void rebuild_irq_db(void);
extern void parse_proc_stat(void);
extern void set_interrupt_count(int number, uint64_t count);
extern void set_msi_interrupt_numa(int number);
extern int get_next_irq(int irq);
extern int find_irq_integer_prop(int irq, enum irq_prop prop);
extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop);
extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type);
extern GList *rebalance_irq_list;
void calculate_workload(void);
void update_migration_status(void);
void reset_counts(void);
void dump_workloads(void);
void sort_irq_list(void);
void sort_irq_list(GList **list);
void calculate_placement(void);
void dump_tree(void);
void activate_mapping(void);
void activate_mappings(void);
void account_for_nic_stats(void);
void check_power_mode(void);
void clear_cpu_tree(void);
void pci_numa_scan(void);
/*===================NEW BALANCER FUNCTIONS============================*/
/*
* Master topo_obj type lists
*/
extern GList *numa_nodes;
extern GList *packages;
extern GList *cache_domains;
extern GList *cpus;
enum hp_e {
HINT_POLICY_IGNORE,
HINT_POLICY_SUBSET,
HINT_POLICY_EXACT
};
extern int debug_mode;
extern int one_shot_mode;
extern int power_mode;
extern int need_cpu_rescan;
extern enum hp_e hint_policy;
extern unsigned long long cycle_count;
extern unsigned long power_thresh;
/*
* Numa node access routines
*/
extern void build_numa_node_list(void);
extern void free_numa_node_list(void);
extern void dump_numa_node_info(struct topo_obj *node, void *data);
extern void add_package_to_node(struct topo_obj *p, int nodeid);
extern struct topo_obj *get_numa_node(int nodeid);
/*
* Package functions
*/
#define package_numa_node(p) ((p)->parent)
/*
* cache_domain functions
*/
#define cache_domain_package(c) ((c)->parent)
#define cache_domain_numa_node(c) (package_numa_node(cache_domain_package((c))))
/*
* cpu core functions
*/
#define cpu_cache_domain(cpu) ((cpu)->parent)
#define cpu_package(cpu) (cache_domain_package(cpu_cache_domain((cpu))))
#define cpu_numa_node(cpu) (package_numa_node(cache_domain_package(cpu_cache_domain((cpu)))))
extern struct topo_obj *find_cpu_core(int cpunr);
extern int get_cpu_count(void);
/*
* irq db functions
*/
extern void rebuild_irq_db(void);
extern void free_irq_db(void);
extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data);
extern struct irq_info *get_irq_info(int irq);
extern void migrate_irq(GList **from, GList **to, struct irq_info *info);
extern struct irq_info *add_misc_irq(int irq);
#define irq_numa_node(irq) ((irq)->numa_node)
/*
* Generic object functions
*/
static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data)
{
GList *entry, *next;
entry = g_list_first(list);
while (entry) {
next = g_list_next(entry);
cb(entry->data, data);
entry = next;
}
}
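/*
 * A minimal usage sketch (count_one is a hypothetical callback with the
 * signature for_each_object() expects):
 *
 *	static void count_one(struct topo_obj *obj __attribute__((unused)),
 *			      void *data)
 *	{
 *		(*(int *)data)++;
 *	}
 *
 *	int n = 0;
 *	for_each_object(cpus, count_one, &n);	// n == number of cpu objects
 */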
#endif

irqlist.c

@@ -29,285 +29,183 @@
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <math.h>
#include "types.h"
#include "irqbalance.h"
GList *interrupts;
struct load_balance_info {
unsigned long long int total_load;
unsigned long long avg_load;
int load_sources;
unsigned long long int deviations;
long double std_deviation;
unsigned int num_within;
unsigned int num_over;
unsigned int num_under;
struct topo_obj *powersave;
};
void get_affinity_hint(struct interrupt *irq, int number)
static void gather_load_stats(struct topo_obj *obj, void *data)
{
char buf[PATH_MAX];
cpumask_t tempmask;
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/affinity_hint", number);
file = fopen(buf, "r");
if (!file)
return;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
return;
}
cpumask_parse_user(line, strlen(line), tempmask);
if (!__cpus_full(&tempmask, num_possible_cpus()))
irq->node_mask = tempmask;
fclose(file);
free(line);
struct load_balance_info *info = data;
info->total_load += obj->load;
info->load_sources += 1;
}
/*
* This function classifies and reads various things from /proc about a specific irq
*/
static void investigate(struct interrupt *irq, int number)
static void compute_deviations(struct topo_obj *obj, void *data)
{
DIR *dir;
struct dirent *entry;
char *c, *c2;
int nr , count = 0, can_set = 1;
char buf[PATH_MAX];
sprintf(buf, "/proc/irq/%i", number);
dir = opendir(buf);
do {
entry = readdir(dir);
if (!entry)
break;
if (strcmp(entry->d_name,"smp_affinity")==0) {
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/smp_affinity", number);
file = fopen(buf, "r+");
if (!file)
continue;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
continue;
}
cpumask_parse_user(line, strlen(line), irq->mask);
/*
* Check that we can write the affinity, if
* not take it out of the list.
*/
fputs(line, file);
if (fclose(file) && errno == EIO)
can_set = 0;
free(line);
} else if (strcmp(entry->d_name,"allowed_affinity")==0) {
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/allowed_affinity", number);
file = fopen(buf, "r");
if (!file)
continue;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
continue;
}
cpumask_parse_user(line, strlen(line), irq->allowed_mask);
fclose(file);
free(line);
} else if (strcmp(entry->d_name,"affinity_hint")==0) {
get_affinity_hint(irq, number);
} else {
irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS);
}
struct load_balance_info *info = data;
unsigned long long int deviation;
} while (entry);
closedir(dir);
irq->balance_level = map_class_to_level[irq->class];
deviation = (obj->load > info->avg_load) ?
obj->load - info->avg_load :
info->avg_load - obj->load;
for (nr = 0; nr < NR_CPUS; nr++)
if (cpu_isset(nr, irq->allowed_mask))
count++;
/* if there is no choice in the allowed mask, don't bother to balance */
if ((count<2) || (can_set == 0))
irq->balance_level = BALANCE_NONE;
/* next, check the IRQBALANCE_BANNED_INTERRUPTS env variable for blacklisted irqs */
c = c2 = getenv("IRQBALANCE_BANNED_INTERRUPTS");
if (!c)
return;
do {
c = c2;
nr = strtoul(c, &c2, 10);
if (c!=c2 && nr == number)
irq->balance_level = BALANCE_NONE;
} while (c!=c2 && c2!=NULL);
info->deviations += (deviation * deviation);
}
/* Set numa node number for MSI interrupt;
* Assumes existing irq metadata
*/
void set_msi_interrupt_numa(int number)
static void move_candidate_irqs(struct irq_info *info, void *data)
{
GList *item;
struct interrupt *irq;
int node;
int *remaining_deviation = (int *)data;
node = find_irq_integer_prop(number, IRQ_NUMA);
if (node < 0)
return;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
if (irq->number == number) {
irq->node_num = node;
irq->msi = 1;
/* never move an irq that has an affinity hint when
* hint_policy is HINT_POLICY_EXACT
*/
if (hint_policy == HINT_POLICY_EXACT)
if (!cpus_empty(info->affinity_hint))
return;
}
item = g_list_next(item);
}
}
/*
* Set the number of interrupts received for a specific irq;
* create the irq metadata if there is none yet
*/
void set_interrupt_count(int number, uint64_t count)
{
GList *item;
struct interrupt *irq;
if (count < MIN_IRQ_COUNT && !one_shot_mode)
return; /* no need to track or set interrupt sources without any activity since boot
but allow for a few (20) boot-time-only interrupts */
item = g_list_first(interrupts);
while (item) {
irq = item->data;
if (irq->number == number) {
irq->count = count;
/* see if affinity_hint changed */
get_affinity_hint(irq, number);
return;
}
item = g_list_next(item);
}
/* new interrupt */
irq = malloc(sizeof(struct interrupt));
if (!irq)
/* Don't rebalance irqs that don't want it */
if (info->level == BALANCE_NONE)
return;
memset(irq, 0, sizeof(struct interrupt));
irq->node_num = -1;
irq->number = number;
irq->count = count;
irq->allowed_mask = CPU_MASK_ALL;
investigate(irq, number);
interrupts = g_list_append(interrupts, irq);
/* Don't move cpus that only have one irq, regardless of load */
if (g_list_length(info->assigned_obj->interrupts) <= 1)
return;
/* Stop rebalancing if we've estimated a full reduction of deviation */
if (*remaining_deviation <= 0)
return;
*remaining_deviation -= info->load;
if (debug_mode)
printf("Selecting irq %d for rebalancing\n", info->irq);
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
info->assigned_obj = NULL;
}
/*
* Set the numa affinity mask for a specific interrupt if there
* is metadata for the interrupt; do nothing if no such data
* exists.
*/
void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type)
static void migrate_overloaded_irqs(struct topo_obj *obj, void *data)
{
GList *item;
struct interrupt *irq;
struct load_balance_info *info = data;
int deviation;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
/*
* Don't rebalance irqs on objects whose load is below the average
*/
if (obj->load <= info->avg_load) {
if ((obj->load + info->std_deviation) <= info->avg_load) {
info->num_under++;
info->powersave = obj;
} else
info->num_within++;
return;
}
if (irq->number == number) {
cpus_or(irq->numa_mask, irq->numa_mask, mask);
irq->node_num = node_num;
if (irq->class < type && irq->balance_level != BALANCE_NONE) {
irq->class = type;
irq->balance_level = map_class_to_level[irq->class];
}
return;
deviation = obj->load - info->avg_load;
if ((deviation > info->std_deviation) &&
(g_list_length(obj->interrupts) > 1)) {
info->num_over++;
/*
* We have a cpu that is overloaded and
* has irqs that can be moved to fix that
*/
/* order the list from least to greatest workload */
sort_irq_list(&obj->interrupts);
/*
* Each irq carries a weighted average amount of load
* we think it's responsible for. Set deviation to the
* difference between this object's load and the average,
* and migrate irqs until we only have one left, or until that
* difference reaches zero
*/
for_each_irq(obj->interrupts, move_candidate_irqs, &deviation);
} else
info->num_within++;
}
static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused)))
{
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
}
static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused)))
{
obj->powersave_mode = 0;
}
#define find_overloaded_objs(name, info) do {\
int ___load_sources;\
memset(&(info), 0, sizeof(struct load_balance_info));\
for_each_object((name), gather_load_stats, &(info));\
(info).avg_load = (info).total_load / (info).load_sources;\
for_each_object((name), compute_deviations, &(info));\
___load_sources = ((info).load_sources == 1) ? 1 : ((info).load_sources - 1);\
(info).std_deviation = (long double)((info).deviations / ___load_sources);\
(info).std_deviation = sqrt((info).std_deviation);\
for_each_object((name), migrate_overloaded_irqs, &(info));\
}while(0)
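/*
 * A small worked example (assuming four cpus with loads 10, 10, 10, 50):
 * total_load = 80, so avg_load = 20; deviations = 3*100 + 900 = 1200,
 * divided by (load_sources - 1) = 3 gives 400, so std_deviation = 20.
 * In migrate_overloaded_irqs() only the cpu at load 50 deviates by more
 * than std_deviation (30 > 20), so only its irqs become migration
 * candidates.
 */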
void update_migration_status(void)
{
struct load_balance_info info;
find_overloaded_objs(cpus, info);
if (cycle_count > 5) {
if (!info.num_over && (info.num_under >= power_thresh)) {
syslog(LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number);
info.powersave->powersave_mode = 1;
for_each_irq(info.powersave->interrupts, force_irq_migration, NULL);
} else if (info.num_over) {
syslog(LOG_INFO, "Load average increasing, re-enabling all cpus for irq balancing\n");
for_each_object(cpus, clear_powersave_mode, NULL);
}
}
find_overloaded_objs(cache_domains, info);
find_overloaded_objs(packages, info);
find_overloaded_objs(numa_nodes, info);
}
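/*
 * e.g. with --powerthresh=2 and cycle_count > 5: once no cpu is
 * overloaded and at least two cpus sit more than one standard deviation
 * below the average load, the last such cpu found is put in
 * powersave_mode and its irqs are forced back onto rebalance_irq_list.
 */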
void calculate_workload(void)
static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused)))
{
int i;
GList *item;
struct interrupt *irq;
for (i=0; i<7; i++)
class_counts[i]=0;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
irq->workload = irq->count - irq->old_count + irq->workload/3 + irq->extra;
class_counts[irq->class]++;
irq->old_count = irq->count;
irq->extra = 0;
}
info->last_irq_count = info->irq_count;
info->irq_count = 0;
}
void reset_counts(void)
{
GList *item;
struct interrupt *irq;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
irq->old_count = irq->count;
irq->extra = 0;
for_each_irq(NULL, reset_irq_count, NULL);
}
}
static void dump_workload(struct irq_info *info, void *unused __attribute__((unused)))
{
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load);
}
void dump_workloads(void)
{
GList *item;
struct interrupt *irq;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload);
}
for_each_irq(NULL, dump_workload, NULL);
}
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
struct interrupt *a, *b;
a = (struct interrupt*)A;
b = (struct interrupt*)B;
if (a->class < b->class)
return 1;
if (a->class > b->class)
return -1;
if (a->workload < b->workload)
return 1;
if (a->workload > b->workload)
return -1;
if (a<b)
return 1;
return -1;
}
void sort_irq_list(void)
{
/* sort by class first (high->low) and then by workload (high->low) */
interrupts = g_list_sort(interrupts, sort_irqs);
}

numa.c

@@ -33,24 +33,130 @@
#include "irqbalance.h"
void pci_numa_scan(void)
#define SYSFS_NODE_PATH "/sys/devices/system/node"
GList *numa_nodes = NULL;
struct topo_obj unspecified_node = {
.load = 0,
.number = -1,
.obj_type = OBJ_TYPE_NODE,
.mask = CPU_MASK_ALL,
.interrupts = NULL,
.children = NULL,
.parent = NULL,
.obj_type_list = &numa_nodes,
};
static void add_one_node(const char *nodename)
{
int irq = -1;
cpumask_t mask;
int node_num;
do {
int type;
irq = get_next_irq(irq);
if (irq == -1)
break;
char *path = alloca(strlen(SYSFS_NODE_PATH) + strlen(nodename) + strlen("/cpumap") + 2);
struct topo_obj *new;
char *cpustr;
FILE *f;
mask = find_irq_cpumask_prop(irq, IRQ_LCPU_MASK);
node_num = find_irq_integer_prop(irq, IRQ_NUMA);
type = find_irq_integer_prop(irq, IRQ_CLASS);
add_interrupt_numa(irq, mask, node_num, type);
} while (irq != -1);
if (!path)
return;
new = calloc(1, sizeof(struct topo_obj));
if (!new)
return;
sprintf(path, "%s/%s/cpumap", SYSFS_NODE_PATH, nodename);
f = fopen(path, "r");
if (!f) {
cpus_clear(new->mask);
} else {
cpustr = NULL;
fscanf(f, "%as", &cpustr);
if (!cpustr) {
cpus_clear(new->mask);
} else {
cpumask_parse_user(cpustr, strlen(cpustr), new->mask);
free(cpustr);
}
fclose(f);
}
new->obj_type = OBJ_TYPE_NODE;
new->number = strtoul(&nodename[4], NULL, 10);
new->obj_type_list = &numa_nodes;
numa_nodes = g_list_append(numa_nodes, new);
}
void build_numa_node_list(void)
{
DIR *dir = opendir(SYSFS_NODE_PATH);
struct dirent *entry;
if (!dir)
return;
do {
entry = readdir(dir);
if (!entry)
break;
if ((entry->d_type == DT_DIR) && (strstr(entry->d_name, "node"))) {
add_one_node(entry->d_name);
}
} while (entry);
closedir(dir);
}
static void free_numa_node(gpointer data)
{
free(data);
}
void free_numa_node_list(void)
{
g_list_free_full(numa_nodes, free_numa_node);
numa_nodes = NULL;
}
static gint compare_node(gconstpointer a, gconstpointer b)
{
const struct topo_obj *ai = a;
const struct topo_obj *bi = b;
return (ai->number == bi->number) ? 0 : 1;
}
void add_package_to_node(struct topo_obj *p, int nodeid)
{
struct topo_obj find, *node;
find.number = nodeid;
GList *entry;
find.number = nodeid;
entry = g_list_find_custom(numa_nodes, &find, compare_node);
if (!entry) {
if (debug_mode)
printf("Could not find numa node for node id %d\n", nodeid);
return;
}
node = entry->data;
if (!p->parent) {
node->children = g_list_append(node->children, p);
p->parent = node;
}
}
void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused)))
{
char buffer[4096];
printf("NUMA NODE NUMBER: %d\n", d->number);
cpumask_scnprintf(buffer, 4096, d->mask);
printf("LOCAL CPU MASK: %s\n", buffer);
printf("\n");
}
struct topo_obj *get_numa_node(int nodeid)
{
struct topo_obj find;
GList *entry;
if (nodeid == -1)
return &unspecified_node;
find.number = nodeid;
entry = g_list_find_custom(numa_nodes, &find, compare_node);
return entry ? entry->data : NULL;
}


@@ -30,355 +30,167 @@
int power_mode;
extern GList *interrupts, *packages, *cache_domains, *cpus;
GList *rebalance_irq_list;
static uint64_t package_cost_func(struct interrupt *irq, struct package *package)
struct obj_placement {
struct topo_obj *best;
struct topo_obj *least_irqs;
uint64_t best_cost;
struct irq_info *info;
};
static void find_best_object(struct topo_obj *d, void *data)
{
int bonus = 0;
int maxcount;
int dist;
/* moving to a cold package/cache/etc gets you a 3000 penalty */
if (!cpus_intersects(irq->old_mask, package->mask))
bonus = CROSS_PACKAGE_PENALTY;
struct obj_placement *best = (struct obj_placement *)data;
uint64_t newload;
cpumask_t subset;
/* do a little numa affinity */
if (irq->node_num != package->node_num) {
if (irq->node_num >= 0 && package->node_num >= 0) {
dist = numa_distance(irq->node_num, package->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
/*
* If the hint policy is subset, then we only want
* to consider objects that are within the irq's hint, but
* only if that irq in fact has published a hint
*/
if (hint_policy == HINT_POLICY_SUBSET) {
if (!cpus_empty(best->info->affinity_hint)) {
cpus_and(subset, best->info->affinity_hint, d->mask);
if (cpus_empty(subset))
return;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
if (d->powersave_mode)
return;
/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
if (power_mode)
bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number;
/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
if (package->class_count[irq->class]>=maxcount && !power_mode)
bonus += 300000;
/* if the package has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, package->mask))
bonus += 600000;
return irq->workload + bonus;
}
static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
{
int bonus = 0;
int dist;
/* moving to a cold cache gets you a 1500 penalty */
if (!cpus_intersects(irq->old_mask, cache_domain->mask))
bonus = CROSS_PACKAGE_PENALTY/2;
/* do a little numa affinity */
if (irq->node_num != cache_domain->node_num) {
if (irq->node_num >= 0 && cache_domain->node_num >= 0) {
dist = numa_distance(irq->node_num, cache_domain->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
newload = d->load;
if (newload < best->best_cost) {
best->best = d;
best->best_cost = newload;
best->least_irqs = NULL;
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
/* try to avoid having a lot of MSI interrupts (globally, not by device id) on
* the cache domain */
if (irq->msi == 1)
bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
/* if the cache domain has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
bonus += 600000;
return irq->workload + bonus;
}
static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
{
int bonus = 0;
int dist;
/* moving to a colder core gets you a 1000 penalty */
if (!cpus_intersects(irq->old_mask, cpu->mask))
bonus = CROSS_PACKAGE_PENALTY/3;
/* do a little numa affinity */
if (irq->node_num != cpu->node_num) {
if (irq->node_num >= 0 && cpu->node_num >= 0) {
dist = numa_distance(irq->node_num, cpu->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/*
* since some chipsets only place at the first cpu, give a tiny preference to non-first
* cpus for specifically placed interrupts
*/
if (first_cpu(cpu->cache_mask)==cpu->number)
bonus++;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
/* if the core has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, cpu->mask))
bonus += 600000;
return irq->workload + bonus;
}
static void place_cache_domain(struct package *package)
{
GList *iter, *next;
GList *pkg;
struct interrupt *irq;
struct cache_domain *cache_domain;
iter = g_list_first(package->interrupts);
while (iter) {
struct cache_domain *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
if (irq->balance_level <= BALANCE_PACKAGE) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(package->cache_domains);
while (pkg) {
uint64_t newload;
cache_domain = pkg->data;
newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain);
if (newload < best_cost) {
best = cache_domain;
best_cost = newload;
}
pkg = g_list_next(pkg);
}
if (best) {
next = g_list_next(iter);
package->interrupts = g_list_delete_link(package->interrupts, iter);
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
iter = next;
} else
iter = g_list_next(iter);
if (newload == best->best_cost) {
if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
best->least_irqs = d;
}
}
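/*
 * e.g. three candidate children with loads 5, 5 and 9: the two load-5
 * objects tie on best_cost, so least_irqs tracks whichever of them
 * currently holds fewer irqs, and the load-9 object is never chosen.
 */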
static void place_core(struct cache_domain *cache_domain)
static void find_best_object_for_irq(struct irq_info *info, void *data)
{
GList *iter, *next;
GList *pkg;
struct interrupt *irq;
struct cpu_core *cpu;
struct obj_placement place;
struct topo_obj *d = data;
struct topo_obj *asign;
if (!info->moved)
return;
iter = g_list_first(cache_domain->interrupts);
while (iter) {
struct cpu_core *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
switch (d->obj_type) {
case OBJ_TYPE_NODE:
if (info->level == BALANCE_NONE)
return;
break;
/* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */
if (irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(cache_domain->cpu_cores);
while (pkg) {
uint64_t newload;
case OBJ_TYPE_PACKAGE:
if (info->level == BALANCE_PACKAGE)
return;
break;
cpu = pkg->data;
newload = cpu->workload + cpu_cost_func(irq, cpu);
if (newload < best_cost) {
best = cpu;
best_cost = newload;
}
case OBJ_TYPE_CACHE:
if (info->level == BALANCE_CACHE)
return;
break;
pkg = g_list_next(pkg);
}
if (best) {
next = g_list_next(iter);
cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter);
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
iter = next;
} else
iter = g_list_next(iter);
case OBJ_TYPE_CPU:
if (info->level == BALANCE_CORE)
return;
break;
}
place.info = info;
place.best = NULL;
place.least_irqs = NULL;
place.best_cost = INT_MAX;
for_each_object(d->children, find_best_object, &place);
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&d->interrupts, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
static void place_packages(GList *list)
static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused)))
{
GList *iter;
GList *pkg;
struct interrupt *irq;
struct package *package;
if (g_list_length(d->interrupts) > 0)
for_each_irq(d->interrupts, find_best_object_for_irq, d);
}
static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
{
struct obj_placement place;
struct topo_obj *asign;
iter = g_list_first(list);
while (iter) {
struct package *best = NULL;
uint64_t best_cost = INT_MAX;
irq = iter->data;
if (irq->balance_level == BALANCE_NONE) {
iter = g_list_next(iter);
continue;
}
pkg = g_list_first(packages);
while (pkg) {
uint64_t newload;
if (info->level == BALANCE_NONE)
return;
package = pkg->data;
newload = package->workload + package_cost_func(irq, package);
if (newload < best_cost) {
best = package;
best_cost = newload;
}
if (irq_numa_node(info)->number != -1) {
/*
* This irq belongs to a device with a preferred numa node
* put it on that node
*/
migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info);
info->assigned_obj = irq_numa_node(info);
irq_numa_node(info)->load += info->load + 1;
return;
}
pkg = g_list_next(pkg);
}
if (best) {
best->workload += irq->workload + 1;
best->interrupts=g_list_append(best->interrupts, irq);
best->class_count[irq->class]++;
irq->mask = best->mask;
}
iter = g_list_next(iter);
place.best_cost = INT_MAX;
place.best = NULL;
place.least_irqs = NULL;
place.info = info;
for_each_object(numa_nodes, find_best_object, &place);
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
static void place_affinity_hint(GList *list)
static void validate_irq(struct irq_info *info, void *data)
{
/* still need to balance best workload within the affinity_hint mask */
GList *iter;
struct interrupt *irq;
iter = g_list_first(list);
while (iter) {
irq = iter->data;
if (irq->balance_level == BALANCE_NONE) {
iter = g_list_next(iter);
continue;
}
if ((!cpus_empty(irq->node_mask)) &&
(!cpus_equal(irq->mask, irq->node_mask)) &&
(!__cpus_full(&irq->node_mask, num_possible_cpus()))) {
irq->old_mask = irq->mask;
irq->mask = irq->node_mask;
}
iter = g_list_next(iter);
}
if (info->assigned_obj != data)
printf("object validation error: irq %d is wrong, points to %p, should be %p\n",
info->irq, info->assigned_obj, data);
}
static void do_unroutables(void)
static void validate_object(struct topo_obj *d, void *data __attribute__((unused)))
{
struct package *package;
struct cache_domain *cache_domain;
struct cpu_core *cpu;
struct interrupt *irq;
GList *iter, *inter;
inter = g_list_first(interrupts);
while (inter) {
irq = inter->data;
inter = g_list_next(inter);
if (irq->balance_level != BALANCE_NONE)
continue;
iter = g_list_first(packages);
while (iter) {
package = iter->data;
if (cpus_intersects(package->mask, irq->node_mask) ||
cpus_intersects(package->mask, irq->mask))
package->workload += irq->workload;
iter = g_list_next(iter);
}
iter = g_list_first(cache_domains);
while (iter) {
cache_domain = iter->data;
if (cpus_intersects(cache_domain->mask, irq->node_mask)
|| cpus_intersects(cache_domain->mask, irq->mask))
cache_domain->workload += irq->workload;
iter = g_list_next(iter);
}
iter = g_list_first(cpus);
while (iter) {
cpu = iter->data;
if (cpus_intersects(cpu->mask, irq->node_mask) ||
cpus_intersects(cpu->mask, irq->mask))
cpu->workload += irq->workload;
iter = g_list_next(iter);
}
}
if (d->interrupts)
for_each_irq(d->interrupts, validate_irq, d);
}
static void validate_object_tree_placement()
{
for_each_object(packages, validate_object, NULL);
for_each_object(cache_domains, validate_object, NULL);
for_each_object(cpus, validate_object, NULL);
}
void calculate_placement(void)
{
struct package *package;
struct cache_domain *cache_domain;
GList *iter;
/* first clear old data */
clear_work_stats();
sort_irq_list();
do_unroutables();
place_packages(interrupts);
iter = g_list_first(packages);
while (iter) {
package = iter->data;
place_cache_domain(package);
iter = g_list_next(iter);
sort_irq_list(&rebalance_irq_list);
if (g_list_length(rebalance_irq_list) > 0) {
for_each_irq(rebalance_irq_list, place_irq_in_node, NULL);
for_each_object(numa_nodes, place_irq_in_object, NULL);
for_each_object(packages, place_irq_in_object, NULL);
for_each_object(cache_domains, place_irq_in_object, NULL);
}
iter = g_list_first(cache_domains);
while (iter) {
cache_domain = iter->data;
place_core(cache_domain);
iter = g_list_next(iter);
}
/*
* if affinity_hint is populated on irq and is not set to
* all CPUs (meaning it's initialized), honor that above
* anything in the package locality/workload.
*/
place_affinity_hint(interrupts);
if (debug_mode)
validate_object_tree_placement();
}


@@ -28,54 +28,7 @@
#include "irqbalance.h"
extern int power_mode;
static uint64_t previous;
static unsigned int hysteresis;
void check_power_mode(void)
{
FILE *file;
char *line = NULL;
size_t size = 0;
char *c;
uint64_t dummy __attribute__((unused));
uint64_t irq, softirq;
file = fopen("/proc/stat", "r");
if (!file)
return;
if (getline(&line, &size, file)==0)
size=0;
fclose(file);
if (!line)
return;
c=&line[4];
dummy = strtoull(c, &c, 10); /* user */
dummy = strtoull(c, &c, 10); /* nice */
dummy = strtoull(c, &c, 10); /* system */
dummy = strtoull(c, &c, 10); /* idle */
dummy = strtoull(c, &c, 10); /* iowait */
irq = strtoull(c, &c, 10); /* irq */
softirq = strtoull(c, &c, 10); /* softirq */
irq += softirq;
printf("IRQ delta is %lu \n", (unsigned long)(irq - previous) );
if (irq - previous < POWER_MODE_SOFTIRQ_THRESHOLD) {
hysteresis++;
if (hysteresis > POWER_MODE_HYSTERESIS) {
if (debug_mode && !power_mode)
printf("IRQ delta is %lu, switching to power mode \n", (unsigned long)(irq - previous) );
power_mode = 1;
}
} else {
if (debug_mode && power_mode)
printf("IRQ delta is %lu, switching to performance mode \n", (unsigned long)(irq - previous) );
power_mode = 0;
hysteresis = 0;
}
previous = irq;
free(line);
}


@@ -25,6 +25,7 @@
#include <stdint.h>
#include <string.h>
#include <syslog.h>
#include <ctype.h>
#include "cpumask.h"
#include "irqbalance.h"
@@ -39,7 +40,6 @@ void parse_proc_interrupts(void)
FILE *file;
char *line = NULL;
size_t size = 0;
int int_type;
file = fopen("/proc/interrupts", "r");
if (!file)
@@ -48,6 +48,7 @@ void parse_proc_interrupts(void)
/* first line is the header we don't need; nuke it */
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
return;
}
@@ -56,6 +57,7 @@ void parse_proc_interrupts(void)
int number;
uint64_t count;
char *c, *c2;
struct irq_info *info;
if (getline(&line, &size, file)==0)
break;
@@ -65,7 +67,11 @@ void parse_proc_interrupts(void)
proc_int_has_msi = 1;
/* lines with letters in front are special, like NMI count. Ignore */
if (!(line[0]==' ' || (line[0]>='0' && line[0]<='9')))
c = line;
while (isblank(*(c)))
c++;
if (!(*c>='0' && *c<='9'))
break;
c = strchr(line, ':');
if (!c)
@@ -73,6 +79,10 @@
*c = 0;
c++;
number = strtoul(line, NULL, 10);
info = get_irq_info(number);
if (!info)
info = add_misc_irq(number);
count = 0;
cpunr = 0;
@@ -88,18 +98,13 @@
}
if (cpunr != core_count)
need_cpu_rescan = 1;
set_interrupt_count(number, count);
info->last_irq_count = info->irq_count;
info->irq_count = count;
/* is interrupt MSI based? */
int_type = find_irq_integer_prop(number, IRQ_TYPE);
if ((int_type == IRQ_TYPE_MSI) || (int_type == IRQ_TYPE_MSIX)) {
if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX))
msi_found_in_sysfs = 1;
/* Set numa node for irq if it was MSI */
if (debug_mode)
printf("Set MSI interrupt for %d\n", number);
set_msi_interrupt_numa(number);
}
}
if ((proc_int_has_msi) && (!msi_found_in_sysfs)) {
syslog(LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n");
@@ -113,3 +118,138 @@ void parse_proc_interrupts(void)
fclose(file);
free(line);
}
static void accumulate_irq_count(struct irq_info *info, void *data)
{
uint64_t *acc = data;
*acc += (info->irq_count - info->last_irq_count);
}
static void assign_load_slice(struct irq_info *info, void *data)
{
uint64_t *load_slice = data;
info->load = (info->irq_count - info->last_irq_count) * *load_slice;
/*
* Every IRQ has at least a load of 1
*/
if (!info->load)
info->load++;
}
/*
* Recursive helper to estimate the share of interrupt counts on
* ancestor topology objects attributable to this particular object
*/
static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d)
{
uint64_t total_irq_count = 0;
if (d->parent) {
total_irq_count = get_parent_branch_irq_count_share(d->parent);
total_irq_count /= g_list_length(*d->obj_type_list);
}
if (g_list_length(d->interrupts) > 0)
for_each_irq(d->interrupts, accumulate_irq_count, &total_irq_count);
return total_irq_count;
}
static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused)))
{
uint64_t local_irq_counts = 0;
uint64_t load_slice;
int load_divisor = g_list_length(d->children);
d->load /= (load_divisor ? load_divisor : 1);
if (g_list_length(d->interrupts) > 0) {
local_irq_counts = get_parent_branch_irq_count_share(d);
load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1;
for_each_irq(d->interrupts, assign_load_slice, &load_slice);
}
if (d->parent)
d->parent->load += d->load;
}
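/*
 * A small worked example (assuming a cpu whose ancestors have no irqs
 * attached directly, a cpu load of 2000 after scaling, and two local
 * irqs whose counts grew by 300 and 100 this cycle): local_irq_counts
 * is 400, load_slice = 2000 / 400 = 5, and the two irqs are assigned
 * loads of 1500 and 500, proportional to the interrupts they raised.
 */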
void parse_proc_stat()
{
FILE *file;
char *line = NULL;
size_t size = 0;
int cpunr, rc, cpucount;
struct topo_obj *cpu;
int irq_load, softirq_load;
file = fopen("/proc/stat", "r");
if (!file) {
syslog(LOG_WARNING, "WARNING cant open /proc/stat. balacing is broken\n");
return;
}
/* first line is the header we don't need; nuke it */
if (getline(&line, &size, file)==0) {
free(line);
syslog(LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n");
fclose(file);
return;
}
cpucount = 0;
while (!feof(file)) {
if (getline(&line, &size, file)==0)
break;
if (!strstr(line, "cpu"))
break;
cpunr = strtoul(&line[3], NULL, 10);
rc = sscanf(line, "%*s %*d %*d %*d %*d %*d %d %d", &irq_load, &softirq_load);
if (rc < 2)
break;
cpu = find_cpu_core(cpunr);
if (!cpu)
break;
cpucount++;
/*
* For each cpu add the irq and softirq load and propagate that
* all the way up the device tree
*/
if (cycle_count) {
cpu->load = (irq_load + softirq_load) - (cpu->last_load);
/*
* the [soft]irq_load values are in jiffies, which are
* units of 10ms (assuming HZ=100); multiply by 1000 to
* express that in units of 1/100 ms. This gives us a better
* integer distribution of load between irqs
*/
cpu->load *= 1000;
}
cpu->last_load = (irq_load + softirq_load);
}
fclose(file);
if (cpucount != get_cpu_count()) {
syslog(LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n");
return;
}
/*
* Now that we have load for each cpu attribute a fair share of the load
* to each irq on that cpu
*/
for_each_object(cpus, compute_irq_branch_load_share, NULL);
for_each_object(cache_domains, compute_irq_branch_load_share, NULL);
for_each_object(packages, compute_irq_branch_load_share, NULL);
for_each_object(numa_nodes, compute_irq_branch_load_share, NULL);
}

types.h

@@ -26,89 +26,39 @@
#define IRQ_TYPE_MSI 1
#define IRQ_TYPE_MSIX 2
/*
* IRQ properties
*/
enum irq_prop {
IRQ_CLASS = 0,
IRQ_TYPE,
IRQ_NUMA,
IRQ_LCPU_MASK,
IRQ_MAX_PROPERTY
enum obj_type_e {
OBJ_TYPE_CPU,
OBJ_TYPE_CACHE,
OBJ_TYPE_PACKAGE,
OBJ_TYPE_NODE
};
struct package {
uint64_t workload;
int number;
cpumask_t mask;
int node_num;
int class_count[7];
GList *cache_domains;
GList *interrupts;
struct topo_obj {
uint64_t load;
uint64_t last_load;
enum obj_type_e obj_type;
int number;
int powersave_mode;
cpumask_t mask;
GList *interrupts;
struct topo_obj *parent;
GList *children;
GList **obj_type_list;
};
struct cache_domain {
uint64_t workload;
int number;
int marker;
int node_num;
cpumask_t mask;
cpumask_t package_mask;
int class_count[7];
GList *cpu_cores;
GList *interrupts;
struct irq_info {
int irq;
int class;
int type;
int level;
struct topo_obj *numa_node;
cpumask_t cpumask;
cpumask_t affinity_hint;
uint64_t irq_count;
uint64_t last_irq_count;
uint64_t load;
int moved;
struct topo_obj *assigned_obj;
};
struct cpu_core {
uint64_t workload;
int number;
int marker;
int node_num;
int class_count[7];
cpumask_t package_mask;
cpumask_t cache_mask;
cpumask_t mask;
GList *interrupts;
};
struct interrupt {
uint64_t workload;
int balance_level;
int number;
int class;
int node_num;
int msi;
uint64_t count;
uint64_t old_count;
uint64_t extra;
cpumask_t mask;
cpumask_t old_mask;
cpumask_t numa_mask;
cpumask_t allowed_mask;
/* user/driver provided for smarter balancing */
cpumask_t node_mask;
};
#endif