From 93f959c9a6755ad6a5685e9e588a9d24375e539b Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 6 Oct 2011 11:11:58 -0400
Subject: [PATCH] Cut over to base irq placement using new algorithm

This is the big move.  The main loop now uses the new balancing algorithm, based
on standard deviation from the average softirq+irq time as read from
/proc/stat.  Initial results look good.

Also cleaned out the old data from the previous algorithm, so we don't leave any
dangling mess behind.
---
 classify.c       |   4 +-
 cputree.c        |  16 +---
 irqbalance.c     |  21 ++---
 irqbalance.h     |   2 +-
 irqlist.c        |  46 +++-------
 numa.c           |   2 +-
 placement.c      | 223 +++++++++++++----------------------------------
 procinterrupts.c |   5 +-
 types.h          |   5 --
 9 files changed, 90 insertions(+), 234 deletions(-)

diff --git a/classify.c b/classify.c
index 5c6a58a..be3efa5 100644
--- a/classify.c
+++ b/classify.c
@@ -336,9 +336,9 @@ static gint sort_irqs(gconstpointer A, gconstpointer B)
 		return 1;
 	if (a->class > b->class)
 		return -1;
-	if (a->workload < b->workload)
+	if (a->load < b->load)
 		return 1;
-	if (a->workload > b->workload)
+	if (a->load > b->load)
 		return -1;
 	if (a<b)
 		return 1;
diff --git a/cputree.c b/cputree.c
index ec2315c..2d7787c 100644
--- a/cputree.c
+++ b/cputree.c
@@ -255,13 +255,13 @@ static void dump_irq(struct irq_info *info, void *data)
 	int spaces = (long int)data;
 	int i;
 	for (i=0; i<spaces; i++) printf(" ");
-	printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->workload);
+	printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->load);
 }
 
 static void dump_cpu_core(struct common_obj_data *d, void *data __attribute__((unused)))
 {
 	struct cpu_core *c = (struct cpu_core *)d;
-	printf("                CPU number %i  numa_node is %d (workload %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.workload);
+	printf("                CPU number %i  numa_node is %d (load %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.load);
 	if (c->common.interrupts)
 		for_each_irq(c->common.interrupts, dump_irq, (void *)18);
 }
@@ -271,7 +271,7 @@ static void dump_cache_domain(struct common_obj_data *d, void *data)
 	struct cache_domain *c = (struct cache_domain *)d;
 	char *buffer = data;
 	cpumask_scnprintf(buffer, 4095, c->common.mask);
-	printf("        Cache domain %i:  numa_node is %d cpu mask is %s  (workload %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.workload);
+	printf("        Cache domain %i:  numa_node is %d cpu mask is %s  (load %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.load);
 	if (c->cpu_cores)
 		for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL);
 	if (c->common.interrupts)
@@ -283,7 +283,7 @@ static void dump_package(struct common_obj_data *d, void *data)
 	struct package *p = (struct package *)d;
 	char *buffer = data;
 	cpumask_scnprintf(buffer, 4096, p->common.mask);
-	printf("Package %i:  numa_node is %d cpu mask is %s (workload %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.workload);
+	printf("Package %i:  numa_node is %d cpu mask is %s (load %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.load);
 	if (p->cache_domains)
 		for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer);
 	if (p->common.interrupts)
@@ -299,8 +299,6 @@ void dump_tree(void)
 static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused)))
 {
 	struct cpu_core *c = (struct cpu_core *)d;
-	memset(c->class_count, 0, sizeof(c->class_count));
-	c->common.workload = 0;
 	c->common.load = 0;
 	c->irq_load = 0;
 	c->softirq_load = 0;
@@ -309,8 +307,6 @@ static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__(
 static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused)))
 {
 	struct cache_domain *c = (struct cache_domain *)d;
-	memset(c->class_count, 0, sizeof(c->class_count));
-	c->common.workload = 0;
 	c->common.load = 0;
 	for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL);
 }
@@ -318,8 +314,6 @@ static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((
 static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused)))
 {
 	struct package *p = (struct package *)d;
-	memset(p->class_count, 0, sizeof(p->class_count));
-	p->common.workload = 0;
 	p->common.load = 0;
 	for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL);
 }
@@ -327,14 +321,12 @@ static void clear_package_stats(struct common_obj_data *d, void *data __attribut
 static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused)))
 {
 	struct numa_node *n = (struct numa_node *)d;
-	n->common.workload = 0;
 	n->common.load = 0;
 	for_each_package(n->packages, clear_package_stats, NULL);
 }
 
 static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
 {
-	info->workload = 0;
 	info->load = 0;
 }
 
diff --git a/irqbalance.c b/irqbalance.c
index 1d0a75f..6607d38 100644
--- a/irqbalance.c
+++ b/irqbalance.c
@@ -136,6 +136,7 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__(
 
 int main(int argc, char** argv)
 {
+	int compute_migration_status=0;
 
 #ifdef HAVE_GETOPT_LONG
 	parse_command_line(argc, argv);
@@ -188,17 +189,7 @@ int main(int argc, char** argv)
 	capng_apply(CAPNG_SELECT_BOTH);
 #endif
 
-	parse_proc_interrupts();
-	parse_proc_stat();
-	sleep(SLEEP_INTERVAL/4);
-	reset_counts();
-	parse_proc_interrupts();
-	calculate_workload();
-	if (debug_mode)
-		dump_workloads();
-
 	for_each_irq(NULL, force_rebalance_irq, NULL);
-	sort_irq_list(&rebalance_irq_list);
 
 	while (1) {
 		sleep_approx(SLEEP_INTERVAL);
@@ -223,9 +214,14 @@ int main(int argc, char** argv)
 			free_object_tree();
 			build_object_tree();
 			for_each_irq(NULL, force_rebalance_irq, NULL);
-		}
+			compute_migration_status=0;
+		} 
+
+		if (compute_migration_status)	
+			update_migration_status();
+		else
+			compute_migration_status=1;
 
-		calculate_workload();
 
 		calculate_placement();
 		activate_mappings();
@@ -236,7 +232,6 @@ int main(int argc, char** argv)
 			break;
 		counter++;
 
-		for_each_irq(NULL, force_rebalance_irq, NULL);
 	}
 	free_object_tree();
 	return EXIT_SUCCESS;
diff --git a/irqbalance.h b/irqbalance.h
index 81e4cf5..1321219 100644
--- a/irqbalance.h
+++ b/irqbalance.h
@@ -32,7 +32,7 @@ extern void set_msi_interrupt_numa(int number);
 
 extern GList *rebalance_irq_list;
 
-void calculate_workload(void);
+void update_migration_status(void);
 void reset_counts(void);
 void dump_workloads(void);
 void sort_irq_list(GList **list);
diff --git a/irqlist.c b/irqlist.c
index e69f220..462f1b8 100644
--- a/irqlist.c
+++ b/irqlist.c
@@ -36,36 +36,6 @@
 
 
 
-void get_affinity_hint(struct irq_info *irq, int number)
-{
-	char buf[PATH_MAX];
-	cpumask_t tempmask;
-	char *line = NULL;
-	size_t size = 0;
-	FILE *file;
-	sprintf(buf, "/proc/irq/%i/affinity_hint", number);
-	file = fopen(buf, "r");
-	if (!file)
-		return;
-	if (getline(&line, &size, file)==0) {
-		free(line);
-		fclose(file);
-		return;
-	}
-	cpumask_parse_user(line, strlen(line), tempmask);
-	if (!__cpus_full(&tempmask, num_possible_cpus()))
-		irq->affinity_hint = tempmask;
-	fclose(file);
-	free(line);
-}
-
-void build_workload(struct irq_info *info, void *unused __attribute__((unused)))
-{
-	info->workload = info->irq_count - info->last_irq_count + info->workload/3;
-	class_counts[info->class]++;
-	info->last_irq_count = info->irq_count;
-}
-
 struct load_balance_info {
 	unsigned long long int total_load;
 	unsigned long long avg_load;
@@ -98,8 +68,15 @@ static void move_candidate_irqs(struct irq_info *info, void *data)
 {
 	int *remaining_deviation = (int *)data;
 
+	/* Don't rebalance irqs that don't want it */
+	if (info->level == BALANCE_NONE)
+		return;
+
+	/* Don't move cpus that only have one irq, regardless of load */
 	if (g_list_length(info->assigned_obj->interrupts) <= 1)
 		return;
+
+	/* Stop rebalancing if we've estimated a full reduction of deviation */
 	if (*remaining_deviation <= 0)
 		return;
 
@@ -155,20 +132,17 @@ static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data)
 	for_each_##name(NULL, migrate_overloaded_irqs, &(info));\
 }while(0)
 
-void calculate_workload(void)
+void update_migration_status(void)
 {
-	int i;
 	struct load_balance_info info;
 
-	for (i=0; i<7; i++)
-		class_counts[i]=0;
-	for_each_irq(NULL, build_workload, NULL);
 	find_overloaded_objs(cpu_core, info);
 	find_overloaded_objs(cache_domain, info);
 	find_overloaded_objs(package, info);
 	find_overloaded_objs(numa_node, info);
 }
 
+
 static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused)))
 {
 	info->last_irq_count = info->irq_count;
@@ -183,7 +157,7 @@ void reset_counts(void)
 
 static void dump_workload(struct irq_info *info, void *unused __attribute__((unused)))
 {
-	printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->workload);
+	printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->load);
 }
 
 void dump_workloads(void)
diff --git a/numa.c b/numa.c
index c239c55..ce3eb67 100644
--- a/numa.c
+++ b/numa.c
@@ -39,7 +39,7 @@ GList *numa_nodes = NULL;
 
 struct numa_node unspecified_node = {
 	.common = {
-		.workload = 0,
+		.load = 0,
 		.number = -1,
 		.mask = CPU_MASK_ALL,
 		.interrupts = NULL,
diff --git a/placement.c b/placement.c
index b6919fa..8f5623a 100644
--- a/placement.c
+++ b/placement.c
@@ -32,92 +32,36 @@ int power_mode;
 
 GList *rebalance_irq_list;
 
-static uint64_t package_cost_func(struct irq_info *irq, struct package *package)
-{
-	int bonus = 0;
-	int maxcount;
-
-	/* but if the irq has had 0 interrupts for a while move it about more easily */
-	if (irq->workload==0)
-		bonus = bonus / 10;
-
-	/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
-	if (power_mode)
-		bonus += POWER_MODE_PACKAGE_THRESHOLD * package->common.number;
-
-	/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
-	maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
-	if (package->class_count[irq->class]>=maxcount && !power_mode)
-		bonus += 300000;
-
-	return irq->workload + bonus;
-}
-
-static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain *cache_domain)
-{
-	int bonus = 0;
-
-	/* but if the irq has had 0 interrupts for a while move it about more easily */
-	if (irq->workload==0)
-		bonus = bonus / 10;
-
-
-	/* pay 6000 for each previous interrupt of the same class */
-	bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
-
-	/* try to avoid having a lot of MSI interrupt (globally, no by devide id) on
-	 * cache domain */
-	if ((irq->type == IRQ_TYPE_MSI) || (irq->type == IRQ_TYPE_MSIX))
-		bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
-
-
-	return irq->workload + bonus;
-}
-
-static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu)
-{
-	int bonus = 0;
-
-	/* but if the irq has had 0 interrupts for a while move it about more easily */
-	if (irq->workload==0)
-		bonus = bonus / 10;
-
-	/* 
-	 * since some chipsets only place at the first cpu, give a tiny preference to non-first
-	 * cpus for specifically placed interrupts 
-	 */
-	if (first_cpu(cpu_cache_domain(cpu)->common.mask)==cpu->common.number)
-		bonus++;
-
-	/* pay 6000 for each previous interrupt of the same class */
-	bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
-
-	return irq->workload + bonus;
-}
-
-struct cache_domain_placement {
-	struct irq_info *info;
-	struct cache_domain *best;
-	uint64_t best_cost;
+struct obj_placement {
+		struct common_obj_data *best;
+		struct common_obj_data *least_irqs;
+		uint64_t best_cost;
+		struct irq_info *info;
 };
 
-static void find_best_cd(struct common_obj_data *d, void *data)
+static void find_best_object(struct common_obj_data *d, void *data)
 {
-	struct cache_domain *c = (struct cache_domain *)d;
-	struct cache_domain_placement *best = data;
+	struct obj_placement *best = (struct obj_placement *)data;
 	uint64_t newload;
 
-	newload = c->common.workload + cache_domain_cost_func(best->info, c);
+	newload = d->load;
 	if (newload < best->best_cost) {
-		best->best = c;
+		best->best = d;
 		best->best_cost = newload;
+		best->least_irqs = NULL;
 	}
-}	
+
+	if (newload == best->best_cost) {
+		if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
+			best->least_irqs = d;
+	}
+}
 
 static void place_irq_in_cache_domain(struct irq_info *info, void *data)
 {
 	struct package *p = data;
-	struct cache_domain_placement place;
+	struct obj_placement place;
+	struct common_obj_data *asign;
 
 	if (!info->moved)
 		return;
@@ -125,16 +69,19 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data)
 	if (info->level <= BALANCE_PACKAGE)
 		return;
 
-	place.best_cost = INT_MAX;
-	place.best = NULL;
+
 	place.info = info;
+	place.best = NULL;
+	place.least_irqs = NULL;
+	place.best_cost = INT_MAX;
 
-	for_each_cache_domain(p->cache_domains, find_best_cd, &place);
+	for_each_cache_domain(p->cache_domains, find_best_object, &place);
 
-	if (place.best) {
-		migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info);
-		info->assigned_obj = (struct common_obj_data *)place.best;
-		place.best->class_count[info->class]++;
+	asign = place.least_irqs ? place.least_irqs : place.best;
+
+	if (asign) {
+		migrate_irq(&p->common.interrupts, &asign->interrupts, info);
+		info->assigned_obj = asign;
 	}
 
 }
@@ -146,30 +93,11 @@ static void place_cache_domain(struct common_obj_data *d, void *data __attribute
 		for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package);
 }
 
-
-struct core_placement {
-	struct cpu_core *best;
-	uint64_t best_cost;
-	struct irq_info *info;
-};
-
-static void place_irq_in_core(struct common_obj_data *d, void *data)
-{
-	struct cpu_core *c = (struct cpu_core *)d;
-	struct core_placement *best = data;
-	uint64_t newload;
-
-	newload = c->common.workload + cpu_cost_func(best->info, c);
-	if (newload < best->best_cost) {
-		best->best = c;
-		best->best_cost = newload;
-	}
-}
-
 static void place_core(struct irq_info *info, void *data)
 {
 	struct cache_domain *c = data;
-	struct core_placement place;
+	struct obj_placement place;
+	struct common_obj_data *asign;
 
 	if (!info->moved)
 		return;
@@ -180,14 +108,17 @@ static void place_core(struct irq_info *info, void *data)
 
 	place.info = info;
 	place.best = NULL;
+	place.least_irqs = NULL;
 	place.best_cost = INT_MAX;
 
-	for_each_cpu_core(c->cpu_cores, place_irq_in_core, &place);
+	for_each_cpu_core(c->cpu_cores, find_best_object, &place);
 
-	if (place.best) {
-		migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info);
-		info->assigned_obj = (struct common_obj_data *)place.best;
-		place.best->common.workload += info->workload + 1;
+	asign = place.least_irqs ? place.least_irqs : place.best;
+
+	if (asign) {
+		migrate_irq(&c->common.interrupts, &asign->interrupts, info);
+		info->assigned_obj = asign;
+		asign->load += info->load;
 	}
 
 }
@@ -199,29 +130,11 @@ static void place_cores(struct common_obj_data *d, void *data __attribute__((unu
 		for_each_irq(cache_domain->common.interrupts, place_core, cache_domain);
 }
 
-struct package_placement {
-	struct irq_info *info;
-	struct package *best;
-	uint64_t best_cost;
-};
-
-static void find_best_package(struct common_obj_data *d, void *data)
-{
-	struct package *p = (struct package *)d;
-	uint64_t newload;
-	struct package_placement *place = data;
-
-	newload = p->common.workload + package_cost_func(place->info, p);
-	if (newload < place->best_cost) {
-		place->best = p;
-		place->best_cost = newload;
-	}
-}
-
 static void place_irq_in_package(struct irq_info *info, void *data)
 {
-	struct package_placement place;
+	struct obj_placement place;
 	struct numa_node *n = data;
+	struct common_obj_data *asign;
 
 	if (!info->moved)
 		return;
@@ -229,17 +142,19 @@ static void place_irq_in_package(struct irq_info *info, void *data)
 	if (info->level == BALANCE_NONE)
 		return;
 
-	place.best_cost = INT_MAX;
-	place.best = NULL;
 	place.info = info;
+	place.best = NULL;
+	place.least_irqs = NULL;
+	place.best_cost = INT_MAX;
 
-	for_each_package(n->packages, find_best_package, &place);
+	for_each_package(n->packages, find_best_object, &place);
 
-	if (place.best) {
-		migrate_irq(&n->common.interrupts, &place.best->common.interrupts, info);
-		info->assigned_obj = (struct common_obj_data *)place.best;
-		place.best->common.workload += info->workload + 1;
-		place.best->class_count[info->class]++;
+	asign = place.least_irqs ? place.least_irqs : place.best;
+
+	if (asign) {
+		migrate_irq(&n->common.interrupts, &asign->interrupts, info);
+		info->assigned_obj = asign;
+		asign->load += info->load;
 	}
 }
 
@@ -250,29 +165,10 @@ static void place_packages(struct common_obj_data *d, void *data __attribute__((
 		for_each_irq(n->common.interrupts, place_irq_in_package, n);
 }
 
-struct node_placement {
-	struct irq_info *info;
-	struct numa_node *best;
-	uint64_t best_cost;
-};
-
-static void find_best_node(struct common_obj_data *d, void *data)
-{
-	struct numa_node *n = (struct numa_node *)d;
-	struct node_placement *place = data;
-
-	/*
- 	 * Just find the least loaded node
- 	 */
-	if (n->common.workload < place->best_cost) {
-		place->best = n;
-		place->best_cost = n->common.workload;
-	}
-}
-
 static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
 {
-	struct node_placement place;
+	struct obj_placement place;
+	struct common_obj_data *asign;
 
 	if( info->level == BALANCE_NONE)
 		return;
@@ -284,20 +180,23 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((u
  		 */
 		migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info);
 		info->assigned_obj = (struct common_obj_data *)irq_numa_node(info);
-		irq_numa_node(info)->common.workload += info->workload + 1;
+		irq_numa_node(info)->common.load += info->load + 1;
 		return;
 	}
 
 	place.best_cost = INT_MAX;
 	place.best = NULL;
+	place.least_irqs = NULL;
 	place.info = info;
 
-	for_each_numa_node(NULL, find_best_node, &place);
+	for_each_numa_node(NULL, find_best_object, &place);
 
-	if (place.best) {
-		migrate_irq(&rebalance_irq_list, &place.best->common.interrupts, info);
-		info->assigned_obj = (struct common_obj_data *)place.best;
-		place.best->common.workload += info->workload + 1;
+	asign = place.least_irqs ? place.least_irqs : place.best;
+
+	if (asign) {
+		migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
+		info->assigned_obj = asign;
+		asign->load += info->load;
 	}
 }
 
diff --git a/procinterrupts.c b/procinterrupts.c
index 20c5551..0bdef18 100644
--- a/procinterrupts.c
+++ b/procinterrupts.c
@@ -93,7 +93,8 @@ void parse_proc_interrupts(void)
 		}
 		if (cpunr != core_count) 
 			need_cpu_rescan = 1;
-		
+
+		info->last_irq_count = info->irq_count;		
 		info->irq_count = count;
 
 		/* is interrupt MSI based? */
@@ -135,7 +136,7 @@ static void compute_irq_load_share(struct common_obj_data *d, void *data __attri
 
 	for_each_irq(cpu->common.interrupts, accumulate_irq_count, &total_irq_counts);
 
-	load_slice = cpu->common.load / total_irq_counts;
+	load_slice = total_irq_counts ? (cpu->common.load / total_irq_counts) : 1;
 
 	for_each_irq(cpu->common.interrupts, assign_load_slice, &load_slice);
 }
diff --git a/types.h b/types.h
index f380d04..3c13759 100644
--- a/types.h
+++ b/types.h
@@ -28,7 +28,6 @@
 
 
 struct common_obj_data {
-	uint64_t workload;
 	uint64_t load;
 	int number;
 	cpumask_t mask;
@@ -43,7 +42,6 @@ struct numa_node {
 struct package {
 	struct common_obj_data common;
 	struct numa_node *numa_node;
-	int class_count[7];
 	GList	*cache_domains;
 };
 
@@ -51,7 +49,6 @@ struct cache_domain {
 	struct common_obj_data common;
 	int marker;
 	struct package *package;
-	int class_count[7];
 	GList	*cpu_cores;
 };
 
@@ -60,7 +57,6 @@ struct cpu_core {
 	struct common_obj_data common;
 	int	marker;
 	struct cache_domain *cache_domain;
-	int class_count[7];
 	uint64_t irq_load;
 	uint64_t softirq_load;
 };
@@ -75,7 +71,6 @@ struct irq_info {
         cpumask_t affinity_hint;
         uint64_t irq_count;
         uint64_t last_irq_count;
-	uint64_t workload;
 	uint64_t load;
         int moved;
         struct common_obj_data *assigned_obj;