diff --git a/Makefile.am b/Makefile.am index def5130..771f043 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4 INCLUDES = -I${top_srcdir} -LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) +LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \ types.h diff --git a/constants.h b/constants.h index 1439e0c..fe881bd 100644 --- a/constants.h +++ b/constants.h @@ -16,9 +16,10 @@ /* balancing tunings */ #define CROSS_PACKAGE_PENALTY 3000 -#define NUMA_PENALTY 250 +#define NUMA_PENALTY 500 #define POWER_MODE_PACKAGE_THRESHOLD 20000 #define CLASS_VIOLATION_PENTALTY 6000 +#define MSI_CACHE_PENALTY 10000 #define CORE_SPECIFIC_THRESHOLD 5000 /* power mode */ diff --git a/cputree.c b/cputree.c index b3ae8cd..f9e619e 100644 --- a/cputree.c +++ b/cputree.c @@ -55,6 +55,39 @@ cpumask_t cpu_possible_map; */ static cpumask_t unbanned_cpus; +static int search_numa_node(cpumask_t mask) +{ + int node_num, ret; + struct bitmask *node_mask; + cpumask_t cpu_node_mask; + + node_num = numa_num_configured_nodes(); + + if (node_num < 1) + return -1; + + node_mask = numa_allocate_cpumask(); + + node_num--; /* indexing from zero */ + + while (node_num >= 0) { + ret = numa_node_to_cpus(node_num, node_mask); + if (ret) { + node_num--; + continue; + } + memcpy(cpu_node_mask.bits, node_mask->maskp, BITS_TO_LONGS(node_mask->size)*sizeof(unsigned long)); + if (cpus_intersects(mask, cpu_node_mask)) { + numa_free_cpumask(node_mask); + return node_num; + } + node_num--; + } + + numa_free_cpumask(node_mask); + return node_num; +} + static void fill_packages(void) { GList *entry; @@ -76,6 +109,7 @@ static void fill_packages(void) memset(package, 0, sizeof(struct package)); package->mask = cache->package_mask; package->number = cache->number; + package->node_num = 
search_numa_node(package->mask); while (entry2) { struct cache_domain *cache2; cache2 = entry2->data; @@ -113,6 +147,7 @@ static void fill_cache_domain(void) cache->mask = cpu->cache_mask; cache->package_mask = cpu->package_mask; cache->number = cpu->number; + cache->node_num = search_numa_node(cache->mask); cache_domains = g_list_append(cache_domains, cache); cache_domain_count++; while (entry2) { @@ -164,6 +199,9 @@ static void do_one_cpu(char *path) cpu_set(cpu->number, cpu->mask); + /* set numa node of cpu */ + cpu->node_num = search_numa_node(cpu->mask); + /* if the cpu is on the banned list, just don't add it */ if (cpus_intersects(cpu->mask, banned_cpus)) { free(cpu); @@ -229,7 +267,7 @@ static void dump_irqs(int spaces, GList *dump_interrupts) int i; for (i=0; i<spaces; i++) printf(" "); irq = dump_interrupts->data; - printf("Interrupt %i (%s/%u) \n", irq->number, classes[irq->class], (unsigned int)irq->workload); + printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload); dump_interrupts = g_list_next(dump_interrupts); } } @@ -246,18 +284,18 @@ void dump_tree(void) while (p_iter) { package = p_iter->data; cpumask_scnprintf(buffer, 4096, package->mask); - printf("Package %i: cpu mask is %s (workload %lu)\n", package->number, buffer, (unsigned long)package->workload); + printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload); c_iter = g_list_first(package->cache_domains); while (c_iter) { cache_domain = c_iter->data; c_iter = g_list_next(c_iter); cpumask_scnprintf(buffer, 4095, cache_domain->mask); - printf(" Cache domain %i: cpu mask is %s (workload %lu) \n", cache_domain->number, buffer, (unsigned long)cache_domain->workload); + printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload); cp_iter = cache_domain->cpu_cores; while (cp_iter) 
{ cpu = cp_iter->data; cp_iter = g_list_next(cp_iter); - printf(" CPU number %i (workload %lu)\n", cpu->number, (unsigned long)cpu->workload); + printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload); dump_irqs(18, cpu->interrupts); } dump_irqs(10, cache_domain->interrupts); diff --git a/irqbalance.c b/irqbalance.c index 5bce9a4..9809944 100644 --- a/irqbalance.c +++ b/irqbalance.c @@ -31,6 +31,7 @@ int one_shot_mode; int debug_mode; +int numa_avail; int need_cpu_rescan; @@ -70,6 +71,14 @@ int main(int argc, char** argv) if (getenv("IRQBALANCE_DEBUG")) debug_mode=1; + if (numa_available() > -1) { + numa_avail = 1; + } else { + if (debug_mode) + printf("This machine seems not NUMA capable.\n"); + } + + parse_cpu_tree(); @@ -131,8 +140,7 @@ int main(int argc, char** argv) /* to cope with dynamic configurations we scan for new numa information * once every 5 minutes */ - if (counter % NUMA_REFRESH_INTERVAL == 16) - pci_numa_scan(); + pci_numa_scan(); calculate_placement(); activate_mapping(); diff --git a/irqbalance.h b/irqbalance.h index eafe13e..86f62e6 100644 --- a/irqbalance.h +++ b/irqbalance.h @@ -10,6 +10,7 @@ #include <glib.h> #include "types.h" +#include <numa.h> struct interrupt; @@ -25,14 +26,15 @@ extern int need_cpu_rescan; extern int one_shot_mode; extern GList *interrupts; - extern void parse_cpu_tree(void); extern void clear_work_stats(void); extern void parse_proc_interrupts(void); extern void set_interrupt_count(int number, uint64_t count); +extern void set_msi_interrupt_numa(int number, char *devname); extern void add_interrupt_count(int number, uint64_t count, int type); extern int find_class(struct interrupt *irq, char *string); -extern void add_interrupt_numa(int number, cpumask_t mask, int type); +extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type); +int dev_to_node(char *devname); void calculate_workload(void); void reset_counts(void); diff --git a/irqlist.c b/irqlist.c 
index 0997873..cef465b 100644 --- a/irqlist.c +++ b/irqlist.c @@ -146,6 +146,31 @@ static void investigate(struct interrupt *irq, int number) } while (c!=c2 && c2!=NULL); } +/* Set numa node number for MSI interrupt; + * Assumes existing irq metadata + */ +void set_msi_interrupt_numa(int number, char *devname) +{ + GList *item; + struct interrupt *irq; + int node; + + node = dev_to_node(devname); + if (node < 0) + return; + + item = g_list_first(interrupts); + while (item) { + irq = item->data; + + if (irq->number == number) { + irq->node_num = node; + irq->msi = 1; + return; + } + item = g_list_next(item); + } +} /* * Set the number of interrupts received for a specific irq; @@ -177,6 +202,7 @@ void set_interrupt_count(int number, uint64_t count) if (!irq) return; memset(irq, 0, sizeof(struct interrupt)); + irq->node_num = -1; irq->number = number; irq->count = count; irq->allowed_mask = CPU_MASK_ALL; @@ -217,7 +243,7 @@ void add_interrupt_count(int number, uint64_t count, int type) * is metadata for the interrupt; do nothing if no such data * exists. 
*/ -void add_interrupt_numa(int number, cpumask_t mask, int type) +void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type) { GList *item; struct interrupt *irq; @@ -229,6 +255,7 @@ void add_interrupt_numa(int number, cpumask_t mask, int type) if (irq->number == number) { cpus_or(irq->numa_mask, irq->numa_mask, mask); + irq->node_num = node_num; if (irq->class < type && irq->balance_level != BALANCE_NONE) { irq->class = type; irq->balance_level = map_class_to_level[irq->class]; @@ -281,7 +308,7 @@ void dump_workloads(void) irq = item->data; item = g_list_next(item); - printf("Interrupt %i (class %s) has workload %lu \n", irq->number, classes[irq->class], (unsigned long)irq->workload); + printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload); } } diff --git a/network.c b/network.c index ba25af9..da48e3a 100644 --- a/network.c +++ b/network.c @@ -59,26 +59,19 @@ struct nic { static GList *nics; - -static int dev_to_irq(char *devname) +static int dev_to_bus(char *devname, char *busname) { int sock, ret; struct ifreq ifr; struct ethtool_value ethtool; struct ethtool_drvinfo driver; - FILE *file; - char *line = NULL; - size_t size; - int val; - - char buffer[PATH_MAX]; memset(&ifr, 0, sizeof(struct ifreq)); memset(&ethtool, 0, sizeof(struct ethtool_value)); sock = socket(AF_INET, SOCK_DGRAM, 0); if (sock<0) - return 0; + return -1; strcpy(ifr.ifr_name, devname); @@ -87,8 +80,24 @@ static int dev_to_irq(char *devname) ret = ioctl(sock, SIOCETHTOOL, &ifr); close(sock); if (ret<0) + return -1; + strncpy(busname,driver.bus_info,63); + return 0; +} + +static int dev_to_irq(char *devname) +{ + FILE *file; + char *line = NULL; + size_t size; + int val; + char busname[64]; + + char buffer[PATH_MAX]; + + if (dev_to_bus(devname, busname)) return 0; - sprintf(buffer,"/sys/bus/pci/devices/%s/irq", driver.bus_info); + sprintf(buffer,"/sys/bus/pci/devices/%s/irq", busname); file 
= fopen(buffer, "r"); if (!file) return 0; @@ -105,6 +114,38 @@ static int dev_to_irq(char *devname) return val; } +/* Look up devname's PCI bus id via dev_to_bus() and return the number read from its sysfs numa_node file; -1 if the bus lookup, the open or the read fails. */ +int dev_to_node(char *devname) +{ + int node, ret; + char *line = NULL; + FILE *file; + size_t size; + + char busname[64]; + char buffer[PATH_MAX]; + + ret = dev_to_bus(devname, busname); + if (ret) + return -1; + + sprintf(buffer,"/sys/bus/pci/devices/%s/numa_node", busname); + file = fopen(buffer, "r"); + if (!file) + return -1; + if (getline(&line, &size, file) < 0) { /* getline() reports EOF/error as -1, not 0 */ + free(line); + fclose(file); + return -1; + } + fclose(file); + node = 0; + if (line) + node = strtol(line, NULL, 10); /* signed parse, matching pci_numa_scan() */ + free(line); + return node; +} + static struct nic *new_nic(char *name) { struct nic *nic; diff --git a/numa.c b/numa.c index 453a9bc..6bd326e 100644 --- a/numa.c +++ b/numa.c @@ -41,6 +41,7 @@ void pci_numa_scan(void) char line[PATH_MAX]; FILE *file; int irq; + int node_num; unsigned int class; dir = opendir("/sys/bus/pci/devices"); @@ -83,6 +84,16 @@ void pci_numa_scan(void) fclose(file); cpumask_parse_user(line, strlen(line), mask); + /* Add numa_node file support */ + sprintf(line,"/sys/bus/pci/devices/%s/numa_node", entry->d_name); + file = fopen(line, "r"); + if (!file) + continue; + if (fgets(line, PATH_MAX, file)==NULL) + line[0]=0; + fclose(file); /* release the stream before the next device reuses 'file' */ + node_num = strtol(line, NULL, 10); + type = IRQ_OTHER; if ((class>>16) == 0x01) type = IRQ_SCSI; @@ -95,7 +106,7 @@ void pci_numa_scan(void) if ((class>>16) >= 0x03 && (class>>16) <= 0x0C) type = IRQ_LEGACY; - add_interrupt_numa(irq, mask, type); + add_interrupt_numa(irq, mask, node_num, type); } while (entry); closedir(dir); diff --git a/placement.c b/placement.c index ff5809d..828ce8e 100644 --- a/placement.c +++ b/placement.c @@ -36,13 +36,19 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package { int bonus = 0; int maxcount; + int dist; /* moving to a cold package/cache/etc gets you a 3000 penalty */ if (!cpus_intersects(irq->old_mask, package->mask)) bonus = CROSS_PACKAGE_PENALTY; /* do a 
little numa affinity */ - if (!cpus_intersects(irq->numa_mask, package->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != package->node_num) { + if (irq->node_num >= 0 && package->node_num >= 0) { + dist = numa_distance(irq->node_num, package->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) @@ -67,13 +73,20 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain) { int bonus = 0; + int dist; + /* moving to a cold cache gets you a 1500 penalty */ if (!cpus_intersects(irq->old_mask, cache_domain->mask)) bonus = CROSS_PACKAGE_PENALTY/2; /* do a little numa affinity */ - if (!cpus_intersects(irq->numa_mask, cache_domain->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != cache_domain->node_num) { + if (irq->node_num >= 0 && cache_domain->node_num >= 0) { + dist = numa_distance(irq->node_num, cache_domain->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) @@ -83,6 +96,11 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai /* pay 6000 for each previous interrupt of the same class */ bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class]; + /* try to avoid having a lot of MSI interrupts (globally, not by device id) on + * a cache domain */ + if (irq->msi == 1) + bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class]; + /* if the cache domain has no cpus in the allowed mask.. 
just block */ if (!cpus_intersects(irq->allowed_mask, cache_domain->mask)) bonus += 600000; @@ -93,13 +111,20 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu) { int bonus = 0; + int dist; + /* moving to a colder core gets you a 1000 penalty */ if (!cpus_intersects(irq->old_mask, cpu->mask)) bonus = CROSS_PACKAGE_PENALTY/3; /* do a little numa affinity */ - if (!cpus_intersects(irq->numa_mask, cpu->mask)) - bonus += NUMA_PENALTY; + if (irq->node_num != cpu->node_num) { + if (irq->node_num >= 0 && cpu->node_num >= 0) { + dist = numa_distance(irq->node_num, cpu->node_num); + /* moving to a distant numa node results into penalty */ + bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0; + } + } /* but if the irq has had 0 interrupts for a while move it about more easily */ if (irq->workload==0) diff --git a/procinterrupts.c b/procinterrupts.c index e336efe..67da9a8 100644 --- a/procinterrupts.c +++ b/procinterrupts.c @@ -82,6 +82,22 @@ void parse_proc_interrupts(void) need_cpu_rescan = 1; set_interrupt_count(number, count); + + /* is interrupt MSI based? 
*/ + while (*c && *c == ' ') + c++; + if (strstr(c, "PCI-MSI") != NULL) { + while (*c && *c != ' ') + c++; + while (*c && *c == ' ') + c++; + if (*c) { /* c itself is never NULL here; require a non-empty device name */ + /* Set numa node for irq if it was MSI */ + if (debug_mode) + printf("Set MSI interrupt for %d\n", number); + set_msi_interrupt_numa(number, c); + } + } } fclose(file); free(line); diff --git a/types.h b/types.h index b986fe6..d60bb46 100644 --- a/types.h +++ b/types.h @@ -24,6 +24,7 @@ struct package { int number; cpumask_t mask; + int node_num; int class_count[7]; @@ -36,6 +37,7 @@ struct cache_domain { int number; int marker; + int node_num; cpumask_t mask; @@ -53,6 +55,7 @@ struct cpu_core { int number; int marker; + int node_num; int class_count[7]; @@ -70,6 +73,8 @@ struct interrupt { int number; int class; + int node_num; + int msi; uint64_t count; uint64_t old_count;