From 565db19e3d1faebbaba8990c5a404effeadd734f Mon Sep 17 00:00:00 2001 From: nhorman Date: Mon, 1 Aug 2011 18:25:02 +0000 Subject: [PATCH] Enhance irqbalance logic to consider PCI bus topology in IRQ mapping This patch enhances irqbalance so that it considers the proximity of devices to cpus when making irq mapping decisions. In large numa systems this will cause irqs to be biased such that they have unique affinity for cpus on the same numa node as the device triggering them. Resolves: http://code.google.com/p/irqbalance/issues/detail?id=17 Thanks to Petr Holasek for authoring the patch git-svn-id: https://irqbalance.googlecode.com/svn/trunk@34 46b42954-3823-0410-bd82-eb80b452c9b5 --- Makefile.am | 2 +- constants.h | 3 ++- cputree.c | 46 +++++++++++++++++++++++++++++++++---- irqbalance.c | 12 ++++++++-- irqbalance.h | 6 +++-- irqlist.c | 31 +++++++++++++++++++++++-- network.c | 60 ++++++++++++++++++++++++++++++++++++++++-------- numa.c | 12 +++++++++- placement.c | 37 ++++++++++++++++++++++++----- procinterrupts.c | 16 +++++++++++++ types.h | 5 ++++ 11 files changed, 201 insertions(+), 29 deletions(-) diff --git a/Makefile.am b/Makefile.am index def5130..771f043 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4 INCLUDES = -I${top_srcdir} -LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) +LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \ types.h diff --git a/constants.h b/constants.h index 1439e0c..fe881bd 100644 --- a/constants.h +++ b/constants.h @@ -16,9 +16,10 @@ /* balancing tunings */ #define CROSS_PACKAGE_PENALTY 3000 -#define NUMA_PENALTY 250 +#define NUMA_PENALTY 500 #define POWER_MODE_PACKAGE_THRESHOLD 20000 #define CLASS_VIOLATION_PENTALTY 6000 +#define MSI_CACHE_PENALTY 10000 #define CORE_SPECIFIC_THRESHOLD 5000 /* 
power mode */ diff --git a/cputree.c b/cputree.c index b3ae8cd..f9e619e 100644 --- a/cputree.c +++ b/cputree.c @@ -55,6 +55,39 @@ cpumask_t cpu_possible_map; */ static cpumask_t unbanned_cpus; +static int search_numa_node(cpumask_t mask) +{ + int node_num, ret; + struct bitmask *node_mask; + cpumask_t cpu_node_mask; + + node_num = numa_num_configured_nodes(); + + if (node_num < 1) + return -1; + + node_mask = numa_allocate_cpumask(); + + node_num--; /* indexing from zero */ + + while (node_num >= 0) { + ret = numa_node_to_cpus(node_num, node_mask); + if (ret) { + node_num--; + continue; + } + memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long)); + if (cpus_intersects(mask, cpu_node_mask)) { + numa_free_cpumask(node_mask); + return node_num; + } + node_num--; + } + + numa_free_cpumask(node_mask); + return node_num; +} + static void fill_packages(void) { GList *entry; @@ -76,6 +109,7 @@ static void fill_packages(void) memset(package, 0, sizeof(struct package)); package->mask = cache->package_mask; package->number = cache->number; + package->node_num = search_numa_node(package->mask); while (entry2) { struct cache_domain *cache2; cache2 = entry2->data; @@ -113,6 +147,7 @@ static void fill_cache_domain(void) cache->mask = cpu->cache_mask; cache->package_mask = cpu->package_mask; cache->number = cpu->number; + cache->node_num = search_numa_node(cache->mask); cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; while (entry2) { @@ -164,6 +199,9 @@ static void do_one_cpu(char *path) cpu_set(cpu->number, cpu->mask); + /* set numa node of cpu */ + cpu->node_num = search_numa_node(cpu->mask); + /* if the cpu is on the banned list, just don't add it */ if (cpus_intersects(cpu->mask, banned_cpus)) { free(cpu); @@ -229,7 +267,7 @@ static void dump_irqs(int spaces, GList *dump_interrupts) int i; for (i=0; idata; - printf("Interrupt %i (%s/%u) \n", irq->number, classes[irq->class], (unsigned int)irq->workload); + 
printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload); dump_interrupts = g_list_next(dump_interrupts); } } @@ -246,18 +284,18 @@ void dump_tree(void) while (p_iter) { package = p_iter->data; cpumask_scnprintf(buffer, 4096, package->mask); - printf("Package %i: cpu mask is %s (workload %lu)\n", package->number, buffer, (unsigned long)package->workload); + printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload); c_iter = g_list_first(package->cache_domains); while (c_iter) { cache_domain = c_iter->data; c_iter = g_list_next(c_iter); cpumask_scnprintf(buffer, 4095, cache_domain->mask); - printf(" Cache domain %i: cpu mask is %s (workload %lu) \n", cache_domain->number, buffer, (unsigned long)cache_domain->workload); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload); cp_iter = cache_domain->cpu_cores; while (cp_iter) { cpu = cp_iter->data; cp_iter = g_list_next(cp_iter); - printf(" CPU number %i (workload %lu)\n", cpu->number, (unsigned long)cpu->workload); + printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload); dump_irqs(18, cpu->interrupts); } dump_irqs(10, cache_domain->interrupts); diff --git a/irqbalance.c b/irqbalance.c index 5bce9a4..9809944 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -31,6 +31,7 @@ int one_shot_mode; int debug_mode; +int numa_avail; int need_cpu_rescan; @@ -70,6 +71,14 @@ int main(int argc, char** argv) if (getenv("IRQBALANCE_DEBUG")) debug_mode=1; + if (numa_available() > -1) { + numa_avail = 1; + } else { + if (debug_mode) + printf("This machine seems not NUMA capable.\n"); + } + + parse_cpu_tree(); @@ -131,8 +140,7 @@ int main(int argc, char** argv) /* to cope with dynamic configurations we 
scan for new numa information * once every 5 minutes */ - if (counter % NUMA_REFRESH_INTERVAL == 16) - pci_numa_scan(); + pci_numa_scan(); calculate_placement(); activate_mapping(); diff --git a/irqbalance.h b/irqbalance.h index eafe13e..86f62e6 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -10,6 +10,7 @@ #include #include "types.h" +#include struct interrupt; @@ -25,14 +26,15 @@ extern int need_cpu_rescan; extern int one_shot_mode; extern GList *interrupts; - extern void parse_cpu_tree(void); extern void clear_work_stats(void); extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); +extern void set_msi_interrupt_numa(int number, char *devname); extern void add_interrupt_count(int number, uint64_t count, int type); extern int find_class(struct interrupt *irq, char *string); -extern void add_interrupt_numa(int number, cpumask_t mask, int type); +extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); +int dev_to_node(char *devname); void calculate_workload(void); void reset_counts(void); diff --git a/irqlist.c b/irqlist.c index 0997873..cef465b 100644 --- a/irqlist.c +++ b/irqlist.c @@ -146,6 +146,31 @@ static void investigate(struct interrupt *irq, int number) } while (c!=c2 && c2!=NULL); } +/* Set numa node number for MSI interrupt; + * Assumes existing irq metadata + */ +void set_msi_interrupt_numa(int number, char *devname) +{ + GList *item; + struct interrupt *irq; + int node; + + node = dev_to_node(devname); + if (node < 0) + return; + + item = g_list_first(interrupts); + while (item) { + irq = item->data; + + if (irq->number == number) { + irq->node_num = node; + irq->msi = 1; + return; + } + item = g_list_next(item); + } +} /* * Set the number of interrupts received for a specific irq; @@ -177,6 +202,7 @@ void set_interrupt_count(int number, uint64_t count) if (!irq) return; memset(irq, 0, sizeof(struct interrupt)); + irq->node_num = -1; irq->number = number; irq->count = count; 
irq->allowed_mask = CPU_MASK_ALL; @@ -217,7 +243,7 @@ void add_interrupt_count(int number, uint64_t count, int type) * is metadata for the interrupt; do nothing if no such data * exists. */ -void add_interrupt_numa(int number, cpumask_t mask, int type) +void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type) { GList *item; struct interrupt *irq; @@ -229,6 +255,7 @@ void add_interrupt_numa(int number, cpumask_t mask, int type) if (irq->number == number) { cpus_or(irq->numa_mask, irq->numa_mask, mask); + irq->node_num = node_num; if (irq->class < type && irq->balance_level != BALANCE_NONE) { irq->class = type; irq->balance_level = map_class_to_level[irq->class]; @@ -281,7 +308,7 @@ void dump_workloads(void) irq = item->data; item = g_list_next(item); - printf("Interrupt %i (class %s) has workload %lu \n", irq->number, classes[irq->class], (unsigned long)irq->workload); + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload); } } diff --git a/network.c b/network.c index ba25af9..da48e3a 100644 --- a/network.c +++ b/network.c @@ -59,26 +59,19 @@ struct nic { static GList *nics; - -static int dev_to_irq(char *devname) +static int dev_to_bus(char *devname, char *busname) { int sock, ret; struct ifreq ifr; struct ethtool_value ethtool; struct ethtool_drvinfo driver; - FILE *file; - char *line = NULL; - size_t size; - int val; - - char buffer[PATH_MAX]; memset(&ifr, 0, sizeof(struct ifreq)); memset(&ethtool, 0, sizeof(struct ethtool_value)); sock = socket(AF_INET, SOCK_DGRAM, 0); if (sock<0) - return 0; + return -1; strcpy(ifr.ifr_name, devname); @@ -87,8 +80,24 @@ static int dev_to_irq(char *devname) ret = ioctl(sock, SIOCETHTOOL, &ifr); close(sock); if (ret<0) + return -1; + strncpy(busname,driver.bus_info,63); + return 0; +} + +static int dev_to_irq(char *devname) +{ + FILE *file; + char *line = NULL; + size_t size; + int val; + char busname[64]; + + char 
buffer[PATH_MAX]; + + if (dev_to_bus(devname, busname)) return 0; - sprintf(buffer,"/sys/bus/pci/devices/%s/irq", driver.bus_info); + sprintf(buffer,"/sys/bus/pci/devices/%s/irq", busname); file = fopen(buffer, "r"); if (!file) return 0; @@ -105,6 +114,37 @@ static int dev_to_irq(char *devname) return val; } +int dev_to_node(char *devname) +{ + int node, ret; + char *line = NULL; + FILE *file; + size_t size; + + char busname[64]; + char buffer[PATH_MAX]; + + ret = dev_to_bus(devname, busname); + if (ret) + return -1; + + sprintf(buffer,"/sys/bus/pci/devices/%s/numa_node", busname); + file = fopen(buffer, "r"); + if (!file) + return -1; + if (getline(&line, &size, file)==0) { + free(line); + fclose(file); + return -1; + } + fclose(file); + node = 0; + if (line) + node = strtoul(line, NULL, 10); + free(line); + return node; +} + static struct nic *new_nic(char *name) { struct nic *nic; diff --git a/numa.c b/numa.c index 453a9bc..6bd326e 100644 --- a/numa.c +++ b/numa.c @@ -41,6 +41,7 @@ void pci_numa_scan(void) char line[PATH_MAX]; FILE *file; int irq; + int node_num; unsigned int class; dir = opendir("/sys/bus/pci/devices"); @@ -83,6 +84,15 @@ void pci_numa_scan(void) fclose(file); cpumask_parse_user(line, strlen(line), mask); + /* Add numa_node file support */ + sprintf(line,"/sys/bus/pci/devices/%s/numa_node", entry->d_name); + file = fopen(line, "r"); + if (!file) + continue; + if (fgets(line, PATH_MAX, file)==NULL) + line[0]=0; + node_num = strtol(line, NULL, 10); + type = IRQ_OTHER; if ((class>>16) == 0x01) type = IRQ_SCSI; @@ -95,7 +105,7 @@ void pci_numa_scan(void) if ((class>>16) >= 0x03 && (class>>16) <= 0x0C) type = IRQ_LEGACY; - add_interrupt_numa(irq, mask, type); + add_interrupt_numa(irq, mask, node_num, type); } while (entry); closedir(dir); diff --git a/placement.c b/placement.c index ff5809d..828ce8e 100644 --- a/placement.c +++ b/placement.c @@ -36,13 +36,19 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package { int 
bonus = 0; int maxcount; + int dist; /* moving to a cold package/cache/etc gets you a 3000 penalty */ if (!cpus_intersects(irq->old_mask, package->mask)) bonus = CROSS_PACKAGE_PENALTY; /* do a little numa affinity */ - if (!cpus_intersects(irq->numa_mask, package->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != package->node_num) { + if (irq->node_num >= 0 && package->node_num >= 0) { + dist = numa_distance(irq->node_num, package->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) @@ -67,13 +73,20 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain) { int bonus = 0; + int dist; + /* moving to a cold cache gets you a 1500 penalty */ if (!cpus_intersects(irq->old_mask, cache_domain->mask)) bonus = CROSS_PACKAGE_PENALTY/2; /* do a little numa affinity */ - if (!cpus_intersects(irq->numa_mask, cache_domain->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != cache_domain->node_num) { + if (irq->node_num >= 0 && cache_domain->node_num >= 0) { + dist = numa_distance(irq->node_num, cache_domain->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? 
NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) @@ -83,6 +96,11 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai /* pay 6000 for each previous interrupt of the same class */ bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class]; + /* try to avoid having a lot of MSI interrupts (globally, not by device id) on + * cache domain */ + if (irq->msi == 1) + bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class]; + /* if the cache domain has no cpus in the allowed mask.. just block */ if (!cpus_intersects(irq->allowed_mask, cache_domain->mask)) bonus += 600000; @@ -93,13 +111,20 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) { int bonus = 0; + int dist; + /* moving to a colder core gets you a 1000 penalty */ if (!cpus_intersects(irq->old_mask, cpu->mask)) bonus = CROSS_PACKAGE_PENALTY/3; /* do a little numa affinity */ - if (!cpus_intersects(irq->numa_mask, cpu->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != cpu->node_num) { + if (irq->node_num >= 0 && cpu->node_num >= 0) { + dist = numa_distance(irq->node_num, cpu->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) diff --git a/procinterrupts.c b/procinterrupts.c index e336efe..67da9a8 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -82,6 +82,22 @@ void parse_proc_interrupts(void) need_cpu_rescan = 1; set_interrupt_count(number, count); + + /* is interrupt MSI based? 
*/ + while (*c && *c == ' ') + c++; + if (strstr(c, "PCI-MSI") != NULL) { + while (*c && *c != ' ') + c++; + while (*c && *c == ' ') + c++; + if (c) { + /* Set numa node for irq if it was MSI */ + if (debug_mode) + printf("Set MSI interrupt for %d\n", number); + set_msi_interrupt_numa(number, c); + } + } } fclose(file); free(line); diff --git a/types.h b/types.h index b986fe6..d60bb46 100644 --- a/types.h +++ b/types.h @@ -24,6 +24,7 @@ struct package { int number; cpumask_t mask; + int node_num; int class_count[7]; @@ -36,6 +37,7 @@ struct cache_domain { int number; int marker; + int node_num; cpumask_t mask; @@ -53,6 +55,7 @@ struct cpu_core { int number; int marker; + int node_num; int class_count[7]; @@ -70,6 +73,8 @@ struct interrupt { int number; int class; + int node_num; + int msi; uint64_t count; uint64_t old_count;