Cut over to base irq placement using new algorithm

This is the big move.  The main loop now uses the new balancing algorithm, based
on the standard deviation away from the average softirq+irq time as read from
/proc/stat.  Initial results look good.

Also cleaned out old data from the previous algorithm, so we don't have any
dangling mess left behind.
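
For readers skimming the diff: a minimal, self-contained sketch of the statistical
test the new balancer rests on. The names here are illustrative only; the real
bookkeeping lives in struct load_balance_info and migrate_overloaded_irqs below.

	#include <math.h>
	#include <stdint.h>

	/* Hypothetical helper: treat an object as overloaded when its
	 * softirq+irq load sits more than one standard deviation above the
	 * mean load of its peers at the same topology level. */
	static int is_overloaded(uint64_t load, const uint64_t *loads, int n)
	{
		double avg = 0.0, var = 0.0;
		int i;

		for (i = 0; i < n; i++)
			avg += loads[i];
		avg /= n;
		for (i = 0; i < n; i++)
			var += ((double)loads[i] - avg) * ((double)loads[i] - avg);
		var /= n;
		return (double)load > avg + sqrt(var);
	}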
Neil Horman 2011-10-06 11:11:58 -04:00
parent 3953fec6e4
commit 93f959c9a6
9 changed files with 90 additions and 234 deletions

View file

@@ -336,9 +336,9 @@ static gint sort_irqs(gconstpointer A, gconstpointer B)
return 1;
if (a->class > b->class)
return -1;
if (a->workload < b->workload)
if (a->load < b->load)
return 1;
if (a->workload > b->workload)
if (a->load > b->load)
return -1;
if (a<b)
return 1;
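
The comparator above orders the rebalance list best class first, then heaviest
load first, with a final pointer comparison so the ordering is total; the hunk
itself is just the workload-to-load rename. For context, a comparator of this
shape plugs straight into glib (a sketch; irqbalance actually wraps the call in
sort_irq_list):

	rebalance_irq_list = g_list_sort(rebalance_irq_list, sort_irqs);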

View file

@@ -255,13 +255,13 @@ static void dump_irq(struct irq_info *info, void *data)
int spaces = (long int)data;
int i;
for (i=0; i<spaces; i++) printf(" ");
printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->workload);
printf("Interrupt %i node_num is %d (%s/%u) \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned int)info->load);
}
static void dump_cpu_core(struct common_obj_data *d, void *data __attribute__((unused)))
{
struct cpu_core *c = (struct cpu_core *)d;
printf(" CPU number %i numa_node is %d (workload %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.workload);
printf(" CPU number %i numa_node is %d (load %lu)\n", c->common.number, cpu_numa_node(c)->common.number , (unsigned long)c->common.load);
if (c->common.interrupts)
for_each_irq(c->common.interrupts, dump_irq, (void *)18);
}
@@ -271,7 +271,7 @@ static void dump_cache_domain(struct common_obj_data *d, void *data)
struct cache_domain *c = (struct cache_domain *)d;
char *buffer = data;
cpumask_scnprintf(buffer, 4095, c->common.mask);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.workload);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (load %lu) \n", c->common.number, cache_domain_numa_node(c)->common.number, buffer, (unsigned long)c->common.load);
if (c->cpu_cores)
for_each_cpu_core(c->cpu_cores, dump_cpu_core, NULL);
if (c->common.interrupts)
@@ -283,7 +283,7 @@ static void dump_package(struct common_obj_data *d, void *data)
struct package *p = (struct package *)d;
char *buffer = data;
cpumask_scnprintf(buffer, 4096, p->common.mask);
printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.workload);
printf("Package %i: numa_node is %d cpu mask is %s (load %lu)\n", p->common.number, package_numa_node(p)->common.number, buffer, (unsigned long)p->common.load);
if (p->cache_domains)
for_each_cache_domain(p->cache_domains, dump_cache_domain, buffer);
if (p->common.interrupts)
@@ -299,8 +299,6 @@ void dump_tree(void)
static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused)))
{
struct cpu_core *c = (struct cpu_core *)d;
memset(c->class_count, 0, sizeof(c->class_count));
c->common.workload = 0;
c->common.load = 0;
c->irq_load = 0;
c->softirq_load = 0;
@@ -309,8 +307,6 @@ static void clear_cpu_stats(struct common_obj_data *d, void *data __attribute__((unused)))
static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused)))
{
struct cache_domain *c = (struct cache_domain *)d;
memset(c->class_count, 0, sizeof(c->class_count));
c->common.workload = 0;
c->common.load = 0;
for_each_cpu_core(c->cpu_cores, clear_cpu_stats, NULL);
}
@@ -318,8 +314,6 @@ static void clear_cd_stats(struct common_obj_data *d, void *data __attribute__((unused)))
static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused)))
{
struct package *p = (struct package *)d;
memset(p->class_count, 0, sizeof(p->class_count));
p->common.workload = 0;
p->common.load = 0;
for_each_cache_domain(p->cache_domains, clear_cd_stats, NULL);
}
@@ -327,14 +321,12 @@ static void clear_package_stats(struct common_obj_data *d, void *data __attribute__((unused)))
static void clear_node_stats(struct common_obj_data *d, void *data __attribute__((unused)))
{
struct numa_node *n = (struct numa_node *)d;
n->common.workload = 0;
n->common.load = 0;
for_each_package(n->packages, clear_package_stats, NULL);
}
static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
{
info->workload = 0;
info->load = 0;
}

View file

@@ -136,6 +136,7 @@ static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
int main(int argc, char** argv)
{
int compute_migration_status=0;
#ifdef HAVE_GETOPT_LONG
parse_command_line(argc, argv);
@@ -188,17 +189,7 @@ int main(int argc, char** argv)
capng_apply(CAPNG_SELECT_BOTH);
#endif
parse_proc_interrupts();
parse_proc_stat();
sleep(SLEEP_INTERVAL/4);
reset_counts();
parse_proc_interrupts();
calculate_workload();
if (debug_mode)
dump_workloads();
for_each_irq(NULL, force_rebalance_irq, NULL);
sort_irq_list(&rebalance_irq_list);
while (1) {
sleep_approx(SLEEP_INTERVAL);
@@ -223,9 +214,14 @@ int main(int argc, char** argv)
free_object_tree();
build_object_tree();
for_each_irq(NULL, force_rebalance_irq, NULL);
}
compute_migration_status=0;
}
if (compute_migration_status)
update_migration_status();
else
compute_migration_status=1;
calculate_workload();
calculate_placement();
activate_mappings();
@@ -236,7 +232,6 @@ int main(int argc, char** argv)
break;
counter++;
for_each_irq(NULL, force_rebalance_irq, NULL);
}
free_object_tree();
return EXIT_SUCCESS;
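
The control-flow change above is easy to miss in hunk form: the old warm-up
sequence (quarter-interval sleep, reset_counts, calculate_workload) is gone,
replaced by a one-iteration delay flag. A sketch of the resulting loop shape,
condensed from the hunks above (the rescan body is the free/build/force-
rebalance sequence shown at +214):

	int compute_migration_status = 0;

	while (1) {
		sleep_approx(SLEEP_INTERVAL);
		parse_proc_interrupts();
		parse_proc_stat();
		if (need_cpu_rescan) {
			free_object_tree();
			build_object_tree();
			for_each_irq(NULL, force_rebalance_irq, NULL);
			compute_migration_status = 0;	/* stat deltas are stale again */
		}
		if (compute_migration_status)
			update_migration_status();	/* act on a full interval of data */
		else
			compute_migration_status = 1;	/* skip the first, partial interval */
		calculate_placement();
		activate_mappings();
	}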

View file

@@ -32,7 +32,7 @@ extern void set_msi_interrupt_numa(int number);
extern GList *rebalance_irq_list;
void calculate_workload(void);
void update_migration_status(void);
void reset_counts(void);
void dump_workloads(void);
void sort_irq_list(GList **list);

View file

@@ -36,36 +36,6 @@
void get_affinity_hint(struct irq_info *irq, int number)
{
char buf[PATH_MAX];
cpumask_t tempmask;
char *line = NULL;
size_t size = 0;
FILE *file;
sprintf(buf, "/proc/irq/%i/affinity_hint", number);
file = fopen(buf, "r");
if (!file)
return;
if (getline(&line, &size, file)==0) {
free(line);
fclose(file);
return;
}
cpumask_parse_user(line, strlen(line), tempmask);
if (!__cpus_full(&tempmask, num_possible_cpus()))
irq->affinity_hint = tempmask;
fclose(file);
free(line);
}
void build_workload(struct irq_info *info, void *unused __attribute__((unused)))
{
info->workload = info->irq_count - info->last_irq_count + info->workload/3;
class_counts[info->class]++;
info->last_irq_count = info->irq_count;
}
struct load_balance_info {
unsigned long long int total_load;
unsigned long long avg_load;
@@ -98,8 +68,15 @@ static void move_candidate_irqs(struct irq_info *info, void *data)
{
int *remaining_deviation = (int *)data;
/* Don't rebalance irqs that don't want it */
if (info->level == BALANCE_NONE)
return;
/* Don't move cpus that only have one irq, regardless of load */
if (g_list_length(info->assigned_obj->interrupts) <= 1)
return;
/* Stop rebalancing if we've estimated a full reduction of deviation */
if (*remaining_deviation <= 0)
return;
@@ -155,20 +132,17 @@ static void migrate_overloaded_irqs(struct common_obj_data *obj, void *data)
for_each_##name(NULL, migrate_overloaded_irqs, &(info));\
}while(0)
void calculate_workload(void)
void update_migration_status(void)
{
int i;
struct load_balance_info info;
for (i=0; i<7; i++)
class_counts[i]=0;
for_each_irq(NULL, build_workload, NULL);
find_overloaded_objs(cpu_core, info);
find_overloaded_objs(cache_domain, info);
find_overloaded_objs(package, info);
find_overloaded_objs(numa_node, info);
}
static void reset_irq_count(struct irq_info *info, void *unused __attribute__((unused)))
{
info->last_irq_count = info->irq_count;
@@ -183,7 +157,7 @@ void reset_counts(void)
static void dump_workload(struct irq_info *info, void *unused __attribute__((unused)))
{
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->workload);
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", info->irq, irq_numa_node(info)->common.number, classes[info->class], (unsigned long)info->load);
}
void dump_workloads(void)
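
update_migration_status replaces calculate_workload wholesale: a single
load_balance_info accumulator is filled in per topology level, and
migrate_overloaded_irqs then pulls candidate irqs off any object whose load is
out of line, with move_candidate_irqs enforcing the three guards shown above
(opt-outs, single-irq objects, and the remaining-deviation budget). Expanded
by hand for one level, find_overloaded_objs amounts to roughly this; the first
pass is cut off by the hunk above, so its name here is a guess:

	struct load_balance_info info;

	memset(&info, 0, sizeof(info));
	for_each_cpu_core(NULL, gather_load_stats, &info);	/* hypothetical: fills total_load/avg_load */
	for_each_cpu_core(NULL, migrate_overloaded_irqs, &info);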

numa.c (2 lines changed)
View file

@@ -39,7 +39,7 @@ GList *numa_nodes = NULL;
struct numa_node unspecified_node = {
.common = {
.workload = 0,
.load = 0,
.number = -1,
.mask = CPU_MASK_ALL,
.interrupts = NULL,

View file

@@ -32,92 +32,36 @@ int power_mode;
GList *rebalance_irq_list;
static uint64_t package_cost_func(struct irq_info *irq, struct package *package)
{
int bonus = 0;
int maxcount;
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
if (power_mode)
bonus += POWER_MODE_PACKAGE_THRESHOLD * package->common.number;
/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
if (package->class_count[irq->class]>=maxcount && !power_mode)
bonus += 300000;
return irq->workload + bonus;
}
static uint64_t cache_domain_cost_func(struct irq_info *irq, struct cache_domain *cache_domain)
{
int bonus = 0;
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
/* try to avoid having a lot of MSI interrupts (globally, not by device id) on
* a cache domain */
if ((irq->type == IRQ_TYPE_MSI) || (irq->type == IRQ_TYPE_MSIX))
bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
return irq->workload + bonus;
}
static uint64_t cpu_cost_func(struct irq_info *irq, struct cpu_core *cpu)
{
int bonus = 0;
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
bonus = bonus / 10;
/*
* since some chipsets only place at the first cpu, give a tiny preference to non-first
* cpus for specifically placed interrupts
*/
if (first_cpu(cpu_cache_domain(cpu)->common.mask)==cpu->common.number)
bonus++;
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
return irq->workload + bonus;
}
struct cache_domain_placement {
struct irq_info *info;
struct cache_domain *best;
uint64_t best_cost;
struct obj_placement {
struct common_obj_data *best;
struct common_obj_data *least_irqs;
uint64_t best_cost;
struct irq_info *info;
};
static void find_best_cd(struct common_obj_data *d, void *data)
static void find_best_object(struct common_obj_data *d, void *data)
{
struct cache_domain *c = (struct cache_domain *)d;
struct cache_domain_placement *best = data;
struct obj_placement *best = (struct obj_placement *)data;
uint64_t newload;
newload = c->common.workload + cache_domain_cost_func(best->info, c);
newload = d->load;
if (newload < best->best_cost) {
best->best = c;
best->best = d;
best->best_cost = newload;
best->least_irqs = NULL;
}
}
if (newload == best->best_cost) {
if (g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
best->least_irqs = d;
}
}
static void place_irq_in_cache_domain(struct irq_info *info, void *data)
{
struct package *p = data;
struct cache_domain_placement place;
struct obj_placement place;
struct common_obj_data *asign;
if (!info->moved)
return;
@@ -125,16 +69,19 @@ static void place_irq_in_cache_domain(struct irq_info *info, void *data)
if (info->level <= BALANCE_PACKAGE)
return;
place.best_cost = INT_MAX;
place.best = NULL;
place.info = info;
place.best = NULL;
place.least_irqs = NULL;
place.best_cost = INT_MAX;
for_each_cache_domain(p->cache_domains, find_best_cd, &place);
for_each_cache_domain(p->cache_domains, find_best_object, &place);
if (place.best) {
migrate_irq(&p->common.interrupts, &place.best->common.interrupts, info);
info->assigned_obj = (struct common_obj_data *)place.best;
place.best->class_count[info->class]++;
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&p->common.interrupts, &asign->interrupts, info);
info->assigned_obj = asign;
}
}
@@ -146,30 +93,11 @@ static void place_cache_domain(struct common_obj_data *d, void *data __attribute__((unused)))
for_each_irq(package->common.interrupts, place_irq_in_cache_domain, package);
}
struct core_placement {
struct cpu_core *best;
uint64_t best_cost;
struct irq_info *info;
};
static void place_irq_in_core(struct common_obj_data *d, void *data)
{
struct cpu_core *c = (struct cpu_core *)d;
struct core_placement *best = data;
uint64_t newload;
newload = c->common.workload + cpu_cost_func(best->info, c);
if (newload < best->best_cost) {
best->best = c;
best->best_cost = newload;
}
}
static void place_core(struct irq_info *info, void *data)
{
struct cache_domain *c = data;
struct core_placement place;
struct obj_placement place;
struct common_obj_data *asign;
if (!info->moved)
return;
@@ -180,14 +108,17 @@ static void place_core(struct irq_info *info, void *data)
place.info = info;
place.best = NULL;
place.least_irqs = NULL;
place.best_cost = INT_MAX;
for_each_cpu_core(c->cpu_cores, place_irq_in_core, &place);
for_each_cpu_core(c->cpu_cores, find_best_object, &place);
if (place.best) {
migrate_irq(&c->common.interrupts, &place.best->common.interrupts, info);
info->assigned_obj = (struct common_obj_data *)place.best;
place.best->common.workload += info->workload + 1;
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&c->common.interrupts, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
@@ -199,29 +130,11 @@ static void place_cores(struct common_obj_data *d, void *data __attribute__((unused)))
for_each_irq(cache_domain->common.interrupts, place_core, cache_domain);
}
struct package_placement {
struct irq_info *info;
struct package *best;
uint64_t best_cost;
};
static void find_best_package(struct common_obj_data *d, void *data)
{
struct package *p = (struct package *)d;
uint64_t newload;
struct package_placement *place = data;
newload = p->common.workload + package_cost_func(place->info, p);
if (newload < place->best_cost) {
place->best = p;
place->best_cost = newload;
}
}
static void place_irq_in_package(struct irq_info *info, void *data)
{
struct package_placement place;
struct obj_placement place;
struct numa_node *n = data;
struct common_obj_data *asign;
if (!info->moved)
return;
@@ -229,17 +142,19 @@ static void place_irq_in_package(struct irq_info *info, void *data)
if (info->level == BALANCE_NONE)
return;
place.best_cost = INT_MAX;
place.best = NULL;
place.info = info;
place.best = NULL;
place.least_irqs = NULL;
place.best_cost = INT_MAX;
for_each_package(n->packages, find_best_package, &place);
for_each_package(n->packages, find_best_object, &place);
if (place.best) {
migrate_irq(&n->common.interrupts, &place.best->common.interrupts, info);
info->assigned_obj = (struct common_obj_data *)place.best;
place.best->common.workload += info->workload + 1;
place.best->class_count[info->class]++;
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&n->common.interrupts, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
@@ -250,29 +165,10 @@ static void place_packages(struct common_obj_data *d, void *data __attribute__((unused)))
for_each_irq(n->common.interrupts, place_irq_in_package, n);
}
struct node_placement {
struct irq_info *info;
struct numa_node *best;
uint64_t best_cost;
};
static void find_best_node(struct common_obj_data *d, void *data)
{
struct numa_node *n = (struct numa_node *)d;
struct node_placement *place = data;
/*
* Just find the least loaded node
*/
if (n->common.workload < place->best_cost) {
place->best = n;
place->best_cost = n->common.workload;
}
}
static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
{
struct node_placement place;
struct obj_placement place;
struct common_obj_data *asign;
if( info->level == BALANCE_NONE)
return;
@@ -284,20 +180,23 @@ static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
*/
migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->common.interrupts, info);
info->assigned_obj = (struct common_obj_data *)irq_numa_node(info);
irq_numa_node(info)->common.workload += info->workload + 1;
irq_numa_node(info)->common.load += info->load + 1;
return;
}
place.best_cost = INT_MAX;
place.best = NULL;
place.least_irqs = NULL;
place.info = info;
for_each_numa_node(NULL, find_best_node, &place);
for_each_numa_node(NULL, find_best_object, &place);
if (place.best) {
migrate_irq(&rebalance_irq_list, &place.best->common.interrupts, info);
info->assigned_obj = (struct common_obj_data *)place.best;
place.best->common.workload += info->workload + 1;
asign = place.least_irqs ? place.least_irqs : place.best;
if (asign) {
migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
info->assigned_obj = asign;
asign->load += info->load;
}
}
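
The whole placement rewrite converges on one pattern: the bespoke per-level
searches (find_best_cd, place_irq_in_core, find_best_package, find_best_node)
and their cost functions are deleted, and every level now runs the same
find_best_object walk, keyed purely on the object's accumulated load, with
ties broken toward the object carrying the fewest irqs. The per-level callers
all reduce to this shape (a sketch of the package-level call, assuming the
irqbalance headers and <limits.h>):

	struct obj_placement place = {
		.best = NULL,
		.least_irqs = NULL,
		.best_cost = INT_MAX,
		.info = info,
	};
	struct common_obj_data *asign;

	for_each_package(n->packages, find_best_object, &place);
	asign = place.least_irqs ? place.least_irqs : place.best;
	if (asign) {
		migrate_irq(&n->common.interrupts, &asign->interrupts, info);
		info->assigned_obj = asign;
		asign->load += info->load;	/* charge the irq's load to its new home */
	}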

View file

@@ -93,7 +93,8 @@ void parse_proc_interrupts(void)
}
if (cpunr != core_count)
need_cpu_rescan = 1;
info->last_irq_count = info->irq_count;
info->irq_count = count;
/* is interrupt MSI based? */
@@ -135,7 +136,7 @@ static void compute_irq_load_share(struct common_obj_data *d, void *data __attribute__((unused)))
for_each_irq(cpu->common.interrupts, accumulate_irq_count, &total_irq_counts);
load_slice = cpu->common.load / total_irq_counts;
load_slice = total_irq_counts ? (cpu->common.load / total_irq_counts) : 1;
for_each_irq(cpu->common.interrupts, assign_load_slice, &load_slice);
}
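
Two small hardening fixes land here as well: parse_proc_interrupts now
snapshots last_irq_count before overwriting irq_count, and the per-irq load
split no longer divides by zero on a cpu whose interrupts never fired during
the interval. The guard in isolation, with illustrative values:

	uint64_t cpu_load = 5000;	/* accrued softirq+irq load this interval */
	uint64_t total_irq_counts = 0;	/* no interrupts fired on this cpu */
	uint64_t load_slice;

	/* old code: cpu_load / total_irq_counts would trap (SIGFPE on x86) */
	load_slice = total_irq_counts ? (cpu_load / total_irq_counts) : 1;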

View file

@@ -28,7 +28,6 @@
struct common_obj_data {
uint64_t workload;
uint64_t load;
int number;
cpumask_t mask;
@@ -43,7 +42,6 @@ struct numa_node {
struct package {
struct common_obj_data common;
struct numa_node *numa_node;
int class_count[7];
GList *cache_domains;
};
@@ -51,7 +49,6 @@ struct cache_domain {
struct common_obj_data common;
int marker;
struct package *package;
int class_count[7];
GList *cpu_cores;
};
@@ -60,7 +57,6 @@ struct cpu_core {
struct common_obj_data common;
int marker;
struct cache_domain *cache_domain;
int class_count[7];
uint64_t irq_load;
uint64_t softirq_load;
};
@@ -75,7 +71,6 @@ struct irq_info {
cpumask_t affinity_hint;
uint64_t irq_count;
uint64_t last_irq_count;
uint64_t workload;
uint64_t load;
int moved;
struct common_obj_data *assigned_obj;
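
With workload and the per-object class_count arrays gone, every topology level
accounts for exactly one thing: load, held in common_obj_data and summed up the
tree by the placement code above. For a cpu core the two components kept in the
struct presumably combine as follows; this diff shows only the fields, not the
assignment, so treat the line as an assumption:

	c->common.load = c->irq_load + c->softirq_load;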