Enhance irqbalance logic to consider PCI bus topology in IRQ mapping

This patch enhances irqbalance so that it considers the proximity of devices to
cpus when making irq mapping decisions.  In large numa systems this will cause
irqs to be biased such that they have unique affinity for cpus on the same numa
node as the device triggering them.

Resolves:
http://code.google.com/p/irqbalance/issues/detail?id=17

Thanks to Petr Holasek for authoring the patch




git-svn-id: https://irqbalance.googlecode.com/svn/trunk@34 46b42954-3823-0410-bd82-eb80b452c9b5
This commit is contained in:
nhorman 2011-08-01 18:25:02 +00:00
parent 9d23cccf32
commit 565db19e3d
11 changed files with 201 additions and 29 deletions

View file

@ -24,7 +24,7 @@ AUTOMAKE_OPTIONS = no-dependencies
EXTRA_DIST = README INSTALL COPYING autogen.sh cap-ng.m4
INCLUDES = -I${top_srcdir}
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS)
LIBS = $(CAPNG_LDADD) $(GLIB_LIBS) -lnuma
AM_CFLAGS = -g -Os -W -Wall -Wshadow -Wformat -Wundef $(GLIB_CFLAGS) -D_GNU_SOURCE
noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \
types.h

View file

@ -16,9 +16,10 @@
/* balancing tunings */
#define CROSS_PACKAGE_PENALTY 3000
#define NUMA_PENALTY 250
#define NUMA_PENALTY 500
#define POWER_MODE_PACKAGE_THRESHOLD 20000
#define CLASS_VIOLATION_PENTALTY 6000
#define MSI_CACHE_PENALTY 10000
#define CORE_SPECIFIC_THRESHOLD 5000
/* power mode */

View file

@ -55,6 +55,39 @@ cpumask_t cpu_possible_map;
*/
static cpumask_t unbanned_cpus;
/*
 * Find a NUMA node whose cpus overlap the given cpu mask.
 *
 * Nodes are probed from the highest configured node number down to 0 and
 * the first node that shares at least one cpu with mask is returned.
 *
 * Returns the node number, or -1 when libnuma is unusable, no nodes are
 * configured, the scratch cpumask cannot be allocated, or no node matches.
 */
static int search_numa_node(cpumask_t mask)
{
	int node_num;
	size_t copy_len;
	struct bitmask *node_mask;
	cpumask_t cpu_node_mask;

	/* libnuma must not be used at all when numa_available() fails */
	if (numa_available() < 0)
		return -1;

	node_num = numa_num_configured_nodes();
	if (node_num < 1)
		return -1;

	node_mask = numa_allocate_cpumask();
	if (!node_mask)
		return -1;

	/*
	 * libnuma sizes its cpumask on its own; never copy more than
	 * cpumask_t can hold.
	 */
	copy_len = BITS_TO_LONGS(node_mask->size) * sizeof(unsigned long);
	if (copy_len > sizeof(cpu_node_mask.bits))
		copy_len = sizeof(cpu_node_mask.bits);

	/* node numbers are indexed from zero, so start at count - 1 */
	for (node_num--; node_num >= 0; node_num--) {
		/* skip nodes whose cpu list cannot be retrieved */
		if (numa_node_to_cpus(node_num, node_mask))
			continue;

		/* zero first: the copy may cover only part of cpumask_t */
		memset(&cpu_node_mask, 0, sizeof(cpu_node_mask));
		memcpy(cpu_node_mask.bits, node_mask->maskp, copy_len);

		if (cpus_intersects(mask, cpu_node_mask))
			break;
	}

	numa_free_cpumask(node_mask);
	/* node_num is -1 here when the loop ran out without a match */
	return node_num;
}
static void fill_packages(void)
{
GList *entry;
@ -76,6 +109,7 @@ static void fill_packages(void)
memset(package, 0, sizeof(struct package));
package->mask = cache->package_mask;
package->number = cache->number;
package->node_num = search_numa_node(package->mask);
while (entry2) {
struct cache_domain *cache2;
cache2 = entry2->data;
@ -113,6 +147,7 @@ static void fill_cache_domain(void)
cache->mask = cpu->cache_mask;
cache->package_mask = cpu->package_mask;
cache->number = cpu->number;
cache->node_num = search_numa_node(cache->mask);
cache_domains = g_list_append(cache_domains, cache);
cache_domain_count++;
while (entry2) {
@ -164,6 +199,9 @@ static void do_one_cpu(char *path)
cpu_set(cpu->number, cpu->mask);
/* set numa node of cpu */
cpu->node_num = search_numa_node(cpu->mask);
/* if the cpu is on the banned list, just don't add it */
if (cpus_intersects(cpu->mask, banned_cpus)) {
free(cpu);
@ -229,7 +267,7 @@ static void dump_irqs(int spaces, GList *dump_interrupts)
int i;
for (i=0; i<spaces; i++) printf(" ");
irq = dump_interrupts->data;
printf("Interrupt %i (%s/%u) \n", irq->number, classes[irq->class], (unsigned int)irq->workload);
printf("Interrupt %i node_num is %d (%s/%u) \n", irq->number, irq->node_num, classes[irq->class], (unsigned int)irq->workload);
dump_interrupts = g_list_next(dump_interrupts);
}
}
@ -246,18 +284,18 @@ void dump_tree(void)
while (p_iter) {
package = p_iter->data;
cpumask_scnprintf(buffer, 4096, package->mask);
printf("Package %i: cpu mask is %s (workload %lu)\n", package->number, buffer, (unsigned long)package->workload);
printf("Package %i: numa_node is %d cpu mask is %s (workload %lu)\n", package->number, package->node_num, buffer, (unsigned long)package->workload);
c_iter = g_list_first(package->cache_domains);
while (c_iter) {
cache_domain = c_iter->data;
c_iter = g_list_next(c_iter);
cpumask_scnprintf(buffer, 4095, cache_domain->mask);
printf(" Cache domain %i: cpu mask is %s (workload %lu) \n", cache_domain->number, buffer, (unsigned long)cache_domain->workload);
printf(" Cache domain %i: numa_node is %d cpu mask is %s (workload %lu) \n", cache_domain->number, cache_domain->node_num, buffer, (unsigned long)cache_domain->workload);
cp_iter = cache_domain->cpu_cores;
while (cp_iter) {
cpu = cp_iter->data;
cp_iter = g_list_next(cp_iter);
printf(" CPU number %i (workload %lu)\n", cpu->number, (unsigned long)cpu->workload);
printf(" CPU number %i numa_node is %d (workload %lu)\n", cpu->number, cpu->node_num , (unsigned long)cpu->workload);
dump_irqs(18, cpu->interrupts);
}
dump_irqs(10, cache_domain->interrupts);

View file

@ -31,6 +31,7 @@
int one_shot_mode;
int debug_mode;
int numa_avail;
int need_cpu_rescan;
@ -70,6 +71,14 @@ int main(int argc, char** argv)
if (getenv("IRQBALANCE_DEBUG"))
debug_mode=1;
if (numa_available() > -1) {
numa_avail = 1;
} else {
if (debug_mode)
printf("This machine seems not NUMA capable.\n");
}
parse_cpu_tree();
@ -131,8 +140,7 @@ int main(int argc, char** argv)
/* to cope with dynamic configurations we scan for new numa information
* once every 5 minutes
*/
if (counter % NUMA_REFRESH_INTERVAL == 16)
pci_numa_scan();
pci_numa_scan();
calculate_placement();
activate_mapping();

View file

@ -10,6 +10,7 @@
#include <glib.h>
#include "types.h"
#include <numa.h>
struct interrupt;
@ -25,14 +26,15 @@ extern int need_cpu_rescan;
extern int one_shot_mode;
extern GList *interrupts;
extern void parse_cpu_tree(void);
extern void clear_work_stats(void);
extern void parse_proc_interrupts(void);
extern void set_interrupt_count(int number, uint64_t count);
extern void set_msi_interrupt_numa(int number, char *devname);
extern void add_interrupt_count(int number, uint64_t count, int type);
extern int find_class(struct interrupt *irq, char *string);
extern void add_interrupt_numa(int number, cpumask_t mask, int type);
extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type);
int dev_to_node(char *devname);
void calculate_workload(void);
void reset_counts(void);

View file

@ -146,6 +146,31 @@ static void investigate(struct interrupt *irq, int number)
} while (c!=c2 && c2!=NULL);
}
/* Set numa node number for MSI interrupt;
* Assumes existing irq metadata
*/
void set_msi_interrupt_numa(int number, char *devname)
{
GList *item;
struct interrupt *irq;
int node;
node = dev_to_node(devname);
if (node < 0)
return;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
if (irq->number == number) {
irq->node_num = node;
irq->msi = 1;
return;
}
item = g_list_next(item);
}
}
/*
* Set the number of interrupts received for a specific irq;
@ -177,6 +202,7 @@ void set_interrupt_count(int number, uint64_t count)
if (!irq)
return;
memset(irq, 0, sizeof(struct interrupt));
irq->node_num = -1;
irq->number = number;
irq->count = count;
irq->allowed_mask = CPU_MASK_ALL;
@ -217,7 +243,7 @@ void add_interrupt_count(int number, uint64_t count, int type)
* is metadata for the interrupt; do nothing if no such data
* exists.
*/
void add_interrupt_numa(int number, cpumask_t mask, int type)
void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type)
{
GList *item;
struct interrupt *irq;
@ -229,6 +255,7 @@ void add_interrupt_numa(int number, cpumask_t mask, int type)
if (irq->number == number) {
cpus_or(irq->numa_mask, irq->numa_mask, mask);
irq->node_num = node_num;
if (irq->class < type && irq->balance_level != BALANCE_NONE) {
irq->class = type;
irq->balance_level = map_class_to_level[irq->class];
@ -281,7 +308,7 @@ void dump_workloads(void)
irq = item->data;
item = g_list_next(item);
printf("Interrupt %i (class %s) has workload %lu \n", irq->number, classes[irq->class], (unsigned long)irq->workload);
printf("Interrupt %i node_num %d (class %s) has workload %lu \n", irq->number, irq->node_num, classes[irq->class], (unsigned long)irq->workload);
}
}

View file

@ -59,26 +59,19 @@ struct nic {
static GList *nics;
static int dev_to_irq(char *devname)
static int dev_to_bus(char *devname, char *busname)
{
int sock, ret;
struct ifreq ifr;
struct ethtool_value ethtool;
struct ethtool_drvinfo driver;
FILE *file;
char *line = NULL;
size_t size;
int val;
char buffer[PATH_MAX];
memset(&ifr, 0, sizeof(struct ifreq));
memset(&ethtool, 0, sizeof(struct ethtool_value));
sock = socket(AF_INET, SOCK_DGRAM, 0);
if (sock<0)
return 0;
return -1;
strcpy(ifr.ifr_name, devname);
@ -87,8 +80,24 @@ static int dev_to_irq(char *devname)
ret = ioctl(sock, SIOCETHTOOL, &ifr);
close(sock);
if (ret<0)
return -1;
strncpy(busname,driver.bus_info,63);
return 0;
}
static int dev_to_irq(char *devname)
{
FILE *file;
char *line = NULL;
size_t size;
int val;
char busname[64];
char buffer[PATH_MAX];
if (dev_to_bus(devname, busname))
return 0;
sprintf(buffer,"/sys/bus/pci/devices/%s/irq", driver.bus_info);
sprintf(buffer,"/sys/bus/pci/devices/%s/irq", busname);
file = fopen(buffer, "r");
if (!file)
return 0;
@ -105,6 +114,37 @@ static int dev_to_irq(char *devname)
return val;
}
/*
 * Look up the numa node of the PCI device behind a network interface.
 *
 * devname is resolved to a PCI bus address with dev_to_bus(), then
 * /sys/bus/pci/devices/<bus>/numa_node is read and parsed.
 *
 * Returns the node number (>= 0), or -1 when the bus lookup fails, the
 * file cannot be opened or read, or its content is not a non-negative
 * decimal number.
 */
int dev_to_node(char *devname)
{
	long val;
	char *end;
	char *line = NULL;
	FILE *file;
	size_t size = 0;
	/* zero-filled so the name stays terminated even if dev_to_bus()
	 * fills all 63 bytes it is allowed to copy */
	char busname[64] = { 0 };
	char buffer[PATH_MAX];

	if (dev_to_bus(devname, busname))
		return -1;

	snprintf(buffer, sizeof(buffer), "/sys/bus/pci/devices/%s/numa_node", busname);
	file = fopen(buffer, "r");
	if (!file)
		return -1;

	/* getline() reports failure/EOF with -1, never with 0 */
	if (getline(&line, &size, file) < 0) {
		free(line);
		fclose(file);
		return -1;
	}
	fclose(file);

	/* the file may hold a negative value, so parse it as signed */
	val = strtol(line, &end, 10);
	if (end == line || val < 0 || val != (int)val)
		val = -1;

	free(line);
	return (int)val;
}
static struct nic *new_nic(char *name)
{
struct nic *nic;

12
numa.c
View file

@ -41,6 +41,7 @@ void pci_numa_scan(void)
char line[PATH_MAX];
FILE *file;
int irq;
int node_num;
unsigned int class;
dir = opendir("/sys/bus/pci/devices");
@ -83,6 +84,15 @@ void pci_numa_scan(void)
fclose(file);
cpumask_parse_user(line, strlen(line), mask);
/* Add numa_node file support */
sprintf(line,"/sys/bus/pci/devices/%s/numa_node", entry->d_name);
file = fopen(line, "r");
if (!file)
continue;
if (fgets(line, PATH_MAX, file)==NULL)
line[0]=0;
node_num = strtol(line, NULL, 10);
type = IRQ_OTHER;
if ((class>>16) == 0x01)
type = IRQ_SCSI;
@ -95,7 +105,7 @@ void pci_numa_scan(void)
if ((class>>16) >= 0x03 && (class>>16) <= 0x0C)
type = IRQ_LEGACY;
add_interrupt_numa(irq, mask, type);
add_interrupt_numa(irq, mask, node_num, type);
} while (entry);
closedir(dir);

View file

@ -36,13 +36,19 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package
{
int bonus = 0;
int maxcount;
int dist;
/* moving to a cold package/cache/etc gets you a 3000 penalty */
if (!cpus_intersects(irq->old_mask, package->mask))
bonus = CROSS_PACKAGE_PENALTY;
/* do a little numa affinity */
if (!cpus_intersects(irq->numa_mask, package->mask))
bonus += NUMA_PENALTY;
if (irq->node_num != package->node_num) {
if (irq->node_num >= 0 && package->node_num >= 0) {
dist = numa_distance(irq->node_num, package->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
@ -67,13 +73,20 @@ static uint64_t package_cost_func(struct interrupt *irq, struct package *package
static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
{
int bonus = 0;
int dist;
/* moving to a cold cache gets you a 1500 penalty */
if (!cpus_intersects(irq->old_mask, cache_domain->mask))
bonus = CROSS_PACKAGE_PENALTY/2;
/* do a little numa affinity */
if (!cpus_intersects(irq->numa_mask, cache_domain->mask))
bonus += NUMA_PENALTY;
if (irq->node_num != cache_domain->node_num) {
if (irq->node_num >= 0 && cache_domain->node_num >= 0) {
dist = numa_distance(irq->node_num, cache_domain->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)
@ -83,6 +96,11 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai
/* pay 6000 for each previous interrupt of the same class */
bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
/* try to avoid having a lot of MSI interrupt (globally, no by devide id) on
* cache domain */
if (irq->msi == 1)
bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];
/* if the cache domain has no cpus in the allowed mask.. just block */
if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
bonus += 600000;
@ -93,13 +111,20 @@ static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domai
static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
{
int bonus = 0;
int dist;
/* moving to a colder core gets you a 1000 penalty */
if (!cpus_intersects(irq->old_mask, cpu->mask))
bonus = CROSS_PACKAGE_PENALTY/3;
/* do a little numa affinity */
if (!cpus_intersects(irq->numa_mask, cpu->mask))
bonus += NUMA_PENALTY;
if (irq->node_num != cpu->node_num) {
if (irq->node_num >= 0 && cpu->node_num >= 0) {
dist = numa_distance(irq->node_num, cpu->node_num);
/* moving to a distant numa node results into penalty */
bonus += (dist > 10) ? NUMA_PENALTY * (dist-10) : 0;
}
}
/* but if the irq has had 0 interrupts for a while move it about more easily */
if (irq->workload==0)

View file

@ -82,6 +82,22 @@ void parse_proc_interrupts(void)
need_cpu_rescan = 1;
set_interrupt_count(number, count);
/* is interrupt MSI based? */
while (*c && *c == ' ')
c++;
if (strstr(c, "PCI-MSI") != NULL) {
while (*c && *c != ' ')
c++;
while (*c && *c == ' ')
c++;
if (c) {
/* Set numa node for irq if it was MSI */
if (debug_mode)
printf("Set MSI interrupt for %d\n", number);
set_msi_interrupt_numa(number, c);
}
}
}
fclose(file);
free(line);

View file

@ -24,6 +24,7 @@ struct package {
int number;
cpumask_t mask;
int node_num;
int class_count[7];
@ -36,6 +37,7 @@ struct cache_domain {
int number;
int marker;
int node_num;
cpumask_t mask;
@ -53,6 +55,7 @@ struct cpu_core {
int number;
int marker;
int node_num;
int class_count[7];
@ -70,6 +73,8 @@ struct interrupt {
int number;
int class;
int node_num;
int msi;
uint64_t count;
uint64_t old_count;