irqbalance/classify.c
Neil Horman 6e6ac7bc65 policyscript: Add ability to specify a directory for multiple scripts
The policyscript directive allows for the specification of a single script
to define policy for all hardware on a system, which is good as a
site/host specific utility, but it makes for difficult work in the event
that vendors wish to provide guidance for only their own hardware (i.e.
if vendor A wants certain hardware to follow affinity_hinting without
affecting other hardware).  To manage this, let's enhance policyscript to
allow the specification of an entire directory, to which multiple
scripts can be added.  Semantics for this new directory feature are the
same as for the single script case, except that the script exit codes
have additional meaning:

exit code 0 - the script indicates that the referenced irq relates to a
device that this script recognizes and further script processing should
stop

exit code 1 - the script indicates that the referenced irq does not
relate to a device the script recognizes, and script processing should
continue

exit code >= 2 - the script indicates an error has occurred, and any output
from it should be ignored, script processing should continue

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
2018-07-09 12:54:27 -04:00

871 lines
18 KiB
C

#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <assert.h>
#include <errno.h>
#include "irqbalance.h"
#include "types.h"
/* Human-readable irq class names; the table ends with a 0 (null) entry. */
char *classes[] = {
"other",
"legacy",
"storage",
"video",
"ethernet",
"gbit-ethernet",
"10gbit-ethernet",
"virt-event",
0
};
/*
 * Balance level assigned to an irq by default, indexed by the irq's class
 * value (see add_one_irq_to_db() and add_new_irq()).
 */
static int map_class_to_level[8] =
{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };
/*
 * Per-irq overrides collected from the user policy script(s).
 * get_irq_user_policy() fills the whole struct with 0xff bytes before any
 * script runs, so every field reads as -1 until a script sets it.
 */
struct user_irq_policy {
int ban; /* 1 = ban this irq, 0 = do not ban */
int level; /* balance level override; applied only when >= 0 */
int numa_node_set; /* 1 once numa_node holds a script-supplied value */
int numa_node; /* numa node id supplied by the script */
};
/* irq_info entries for every irq added by add_one_irq_to_db() */
static GList *interrupts_db = NULL;
/* irq_info entries for irqs banned while the database was being built */
static GList *banned_irqs = NULL;
/* irq_info entries registered through add_cl_banned_irq() */
GList *cl_banned_irqs = NULL;
/* strdup()ed module names registered through add_cl_banned_module() */
static GList *cl_banned_modules = NULL;
/* Path handed to policy scripts for irqs that have no device directory */
#define SYSFS_DIR "/sys"
/* Directory scanned for PCI devices by rebuild_irq_db() */
#define SYSPCI_DIR "/sys/bus/pci/devices"
/* Number of entries in the major class table of map_pci_irq_class() */
#define PCI_MAX_CLASS 0x14
/* Number of entries in the serial bus sub-class table of map_pci_irq_class() */
#define PCI_MAX_SERIAL_SUBCLASS 0x81
/* Sentinel returned by read_pci_data() when no value could be read */
#define PCI_INVAL_DATA 0xFFFFFFFF
struct pci_info {
unsigned short vendor;
unsigned short device;
unsigned short sub_vendor;
unsigned short sub_device;
unsigned int class;
};
/* PCI vendor ID, device ID (matched by apply_pci_quirks()) */
#define PCI_VENDOR_PLX 0x10b5
#define PCI_DEVICE_PLX_PEX8619 0x8619
#define PCI_VENDOR_CAVIUM 0x177d
#define PCI_DEVICE_CAVIUM_CN61XX 0x0093
/* PCI subsystem vendor ID, subsystem device ID (matched by apply_pci_quirks()) */
#define PCI_SUB_VENDOR_EMC 0x1120
#define PCI_SUB_DEVICE_EMC_055B 0x055b
#define PCI_SUB_DEVICE_EMC_0568 0x0568
#define PCI_SUB_DEVICE_EMC_dd00 0xdd00
/*
 * Software workarounds for devices whose PCI class code does not describe
 * them correctly.
 *
 * Two kinds of device end up here:
 *
 * 1. Special devices that shipped before the PCI spec had a definition
 *    for them.
 *
 * 2. Buggy devices that do not follow the PCI class code definitions.
 *
 * pci:       identification values of the device being classified
 * irq_class: in/out; overwritten with IRQ_SCSI when the device matches a
 *            known quirk, left untouched otherwise
 */
static void apply_pci_quirks(const struct pci_info *pci, int *irq_class)
{
	/* Every quirk listed below requires the EMC subsystem vendor ID */
	if (pci->sub_vendor != PCI_SUB_VENDOR_EMC)
		return;

	if (pci->vendor == PCI_VENDOR_PLX &&
	    pci->device == PCI_DEVICE_PLX_PEX8619) {
		if (pci->sub_device == PCI_SUB_DEVICE_EMC_055B ||
		    pci->sub_device == PCI_SUB_DEVICE_EMC_dd00)
			*irq_class = IRQ_SCSI;
	} else if (pci->vendor == PCI_VENDOR_CAVIUM &&
		   pci->device == PCI_DEVICE_CAVIUM_CN61XX) {
		if (pci->sub_device == PCI_SUB_DEVICE_EMC_0568)
			*irq_class = IRQ_SCSI;
	}
}
/*
 * Determine the IRQ class from a PCI class code.
 *
 * The major class is taken from bits 16 and up of pci_class, the sub-class
 * from bits 8-15.  Returns one of the IRQ_* class values, or IRQ_NODEF when
 * the code falls outside the tables or the table entry itself is IRQ_NODEF.
 */
static int map_pci_irq_class(unsigned int pci_class)
{
	/*
	 * Class codes lifted from below PCI-SIG spec:
	 *
	 * PCI Code and ID Assignment Specification v1.5
	 *
	 * and mapped to irqbalance types here.
	 *
	 * The IRQ_NODEF entry (major class 0xc) is resolved through the
	 * sub-class table further down.
	 */
	static short major_class_codes[PCI_MAX_CLASS] = {
		IRQ_OTHER,	/* 0x00 */
		IRQ_SCSI,	/* 0x01 */
		IRQ_ETH,	/* 0x02 */
		IRQ_VIDEO,	/* 0x03 */
		IRQ_OTHER,	/* 0x04 */
		IRQ_OTHER,	/* 0x05 */
		IRQ_LEGACY,	/* 0x06 */
		IRQ_OTHER,	/* 0x07 */
		IRQ_OTHER,	/* 0x08 */
		IRQ_LEGACY,	/* 0x09 */
		IRQ_OTHER,	/* 0x0a */
		IRQ_OTHER,	/* 0x0b */
		IRQ_NODEF,	/* 0x0c */
		IRQ_ETH,	/* 0x0d */
		IRQ_SCSI,	/* 0x0e */
		IRQ_OTHER,	/* 0x0f */
		IRQ_OTHER,	/* 0x10 */
		IRQ_OTHER,	/* 0x11 */
		IRQ_LEGACY,	/* 0x12 */
		IRQ_LEGACY,	/* 0x13 */
	};
	/*
	 * All sub-class codes for serial bus controllers.
	 * The major class code is 0xc.
	 */
	static short serial_sub_codes[PCI_MAX_SERIAL_SUBCLASS] = {
		IRQ_LEGACY,	/* 0x00 */
		IRQ_LEGACY,	/* 0x01 */
		IRQ_LEGACY,	/* 0x02 */
		IRQ_LEGACY,	/* 0x03 */
		IRQ_SCSI,	/* 0x04 */
		IRQ_LEGACY,	/* 0x05 */
		IRQ_SCSI,	/* 0x06 */
		IRQ_LEGACY,	/* 0x07 */
		IRQ_LEGACY,	/* 0x08 */
		IRQ_LEGACY,	/* 0x09 */
		[0xa ... 0x7f] = IRQ_NODEF,
		IRQ_LEGACY,	/* 0x80 */
	};
	unsigned int major_code = pci_class >> 16;
	unsigned int sub_code = (pci_class & 0xFF00) >> 8;

	/* Major class codes beyond the table are unknown */
	if (major_code >= PCI_MAX_CLASS)
		return IRQ_NODEF;

	/* Everything except the serial bus class maps straight from the table */
	if (major_code != 0xc)
		return major_class_codes[major_code];

	/* Serial bus class: classify by sub-class code */
	if (sub_code >= PCI_MAX_SERIAL_SUBCLASS)
		return IRQ_NODEF;
	return serial_sub_codes[sub_code];
}
/*
 * Read one hexadecimal value from the sysfs attribute "<devpath>/<file>".
 *
 * Returns the parsed value, or PCI_INVAL_DATA when the path does not fit
 * in PATH_MAX, the file cannot be opened, or no hex number can be parsed
 * from it.
 */
static unsigned int read_pci_data(const char *devpath, const char* file)
{
	char path[PATH_MAX];
	FILE *fd;
	unsigned int data = PCI_INVAL_DATA;
	int len;

	/* Bounded formatting: an over-long devpath must not overflow path[] */
	len = snprintf(path, sizeof(path), "%s/%s", devpath, file);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		log(TO_CONSOLE, LOG_WARNING, "PCI: path too long:%s/%s\n", devpath, file);
		return PCI_INVAL_DATA;
	}

	fd = fopen(path, "r");
	if (!fd) {
		log(TO_CONSOLE, LOG_WARNING, "PCI: can't open file:%s\n", path);
		return data;
	}

	/* Anything other than exactly one converted item is a failed read */
	if (fscanf(fd, "%x", &data) != 1)
		data = PCI_INVAL_DATA;
	fclose(fd);

	return data;
}
/*
 * Collect the PCI identification values used for IRQ classification.
 *
 * Reads "vendor", "device", "subsystem_vendor", "subsystem_device" and
 * "class" below devpath, in that order, storing each into *pci as it is
 * read.  Stops at the first attribute that yields PCI_INVAL_DATA and
 * returns -ENODEV; fields read before that point stay filled in.
 * Returns 0 when all five values were read.
 */
static int get_pci_info(const char *devpath, struct pci_info *pci)
{
	unsigned int value;

	value = read_pci_data(devpath, "vendor");
	if (value == PCI_INVAL_DATA)
		return -ENODEV;
	pci->vendor = (unsigned short)value;

	value = read_pci_data(devpath, "device");
	if (value == PCI_INVAL_DATA)
		return -ENODEV;
	pci->device = (unsigned short)value;

	value = read_pci_data(devpath, "subsystem_vendor");
	if (value == PCI_INVAL_DATA)
		return -ENODEV;
	pci->sub_vendor = (unsigned short)value;

	value = read_pci_data(devpath, "subsystem_device");
	if (value == PCI_INVAL_DATA)
		return -ENODEV;
	pci->sub_device = (unsigned short)value;

	value = read_pci_data(devpath, "class");
	if (value == PCI_INVAL_DATA)
		return -ENODEV;
	pci->class = value;

	return 0;
}
/*
 * Return the IRQ class for the device at devpath.
 *
 * Yields IRQ_NODEF when the PCI info cannot be read or the class code maps
 * to a negative class; otherwise the mapped class after device quirks have
 * been applied.
 */
static int get_irq_class(const char *devpath)
{
	struct pci_info pci;
	int irq_class;

	/* Without PCI info from sysfs there is nothing to classify */
	if (get_pci_info(devpath, &pci) < 0)
		return IRQ_NODEF;

	irq_class = map_pci_irq_class(pci.class);
	if (irq_class >= 0) {
		/* Some buggy devices need their class reassigned */
		apply_pci_quirks(&pci, &irq_class);
		return irq_class;
	}

	log(TO_CONSOLE, LOG_WARNING, "Invalid PCI class code %d\n",
	    pci.class);
	return IRQ_NODEF;
}
/*
 * List comparison callback: orders two struct irq_info by irq number.
 * Returns 0 when both carry the same irq, otherwise the difference
 * a->irq - b->irq.
 */
static gint compare_ints(gconstpointer a, gconstpointer b)
{
	int lhs = ((const struct irq_info *)a)->irq;
	int rhs = ((const struct irq_info *)b)->irq;

	return lhs - rhs;
}
/*
 * Record irq as banned on *list unless an entry for it is already there.
 * The new entry is a zeroed struct irq_info with IRQ_FLAG_BANNED set.
 * On allocation failure a warning is logged and the list is left unchanged.
 */
static void add_banned_irq(int irq, GList **list)
{
	struct irq_info key;
	struct irq_info *banned;

	key.irq = irq;
	if (g_list_find_custom(*list, &key, compare_ints) != NULL)
		return;

	banned = calloc(1, sizeof(struct irq_info));
	if (banned == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "No memory to ban irq %d\n", irq);
		return;
	}

	banned->irq = irq;
	banned->flags |= IRQ_FLAG_BANNED;
	*list = g_list_append(*list, banned);

	log(TO_CONSOLE, LOG_INFO, "IRQ %d was BANNED.\n", irq);
}
/* Add irq to the cl_banned_irqs list (no-op if it is already listed). */
void add_cl_banned_irq(int irq)
{
add_banned_irq(irq, &cl_banned_irqs);
}
static int is_banned_irq(int irq)
{
GList *entry;
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(banned_irqs, &find, compare_ints);
return entry ? 1:0;
}
/*
 * List search callback: returns 0 (match) when string a occurs somewhere
 * inside string b, and 1 otherwise.
 */
gint substr_find(gconstpointer a, gconstpointer b)
{
	return strstr(b, a) == NULL;
}
/*
 * Append a copy of modname to *modlist.
 *
 * Nothing is added when substr_find() already reports a match, i.e. when
 * an existing list entry is contained in modname.  On allocation failure
 * a warning is logged and the list is left unchanged.
 */
static void add_banned_module(char *modname, GList **modlist)
{
	char *copy;

	if (g_list_find_custom(*modlist, modname, substr_find) != NULL)
		return;

	copy = strdup(modname);
	if (copy == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "No memory to ban module %s\n", modname);
		return;
	}

	*modlist = g_list_append(*modlist, copy);
}
/* Add a copy of modname to the cl_banned_modules list (see add_banned_module()). */
void add_cl_banned_module(char *modname)
{
add_banned_module(modname, &cl_banned_modules);
}
/*
 * Inserts an irq_info struct into the interrupts_db list.
 *
 * devpath points to the device directory in sysfs for the related device.
 * A NULL devpath means there are no sysfs entries for this irq; in that
 * case no sysfs file is read, the class stays IRQ_OTHER, the numa node is
 * looked up as -1 and the cpumask is set to all cpus.
 *
 * pol supplies optional overrides: pol->level (used when >= 0) and
 * pol->numa_node (used when pol->numa_node_set == 1).
 *
 * Returns the new entry, or NULL when the irq is already in the database,
 * is on the banned list, or memory could not be allocated.
 */
static struct irq_info *add_one_irq_to_db(const char *devpath, int irq, struct user_irq_policy *pol)
{
	int irq_class = IRQ_OTHER;
	struct irq_info *new, find;
	int numa_node;
	char path[PATH_MAX];
	FILE *fd;
	char *lcpu_mask;
	GList *entry;
	ssize_t ret;
	size_t blen;
	int len;

	/*
	 * First check to make sure this isn't a duplicate entry
	 */
	find.irq = irq;
	entry = g_list_find_custom(interrupts_db, &find, compare_ints);
	if (entry) {
		/* devpath may be NULL; never hand NULL to a %s conversion */
		log(TO_CONSOLE, LOG_INFO, "DROPPING DUPLICATE ENTRY FOR IRQ %d on path %s\n",
		    irq, (devpath ? devpath : "(null)"));
		return NULL;
	}

	if (is_banned_irq(irq)) {
		log(TO_ALL, LOG_INFO, "SKIPPING BANNED IRQ %d\n", irq);
		return NULL;
	}

	new = calloc(1, sizeof(struct irq_info));
	if (!new)
		return NULL;

	new->irq = irq;
	new->class = IRQ_OTHER;
	interrupts_db = g_list_append(interrupts_db, new);

	/* Some special irqs have NULL devpath */
	if (devpath != NULL) {
		/* Map PCI class code to irq class */
		irq_class = get_irq_class(devpath);
		/* Unclassifiable device: keep IRQ_OTHER and the zeroed level */
		if (irq_class < 0)
			goto get_numa_node;
	}

	new->class = irq_class;
	if (pol->level >= 0)
		new->level = pol->level;
	else
		new->level = map_class_to_level[irq_class];

get_numa_node:
	numa_node = -1;
	if (devpath != NULL && numa_avail) {
		len = snprintf(path, sizeof(path), "%s/numa_node", devpath);
		if (len > 0 && (size_t)len < sizeof(path)) {
			fd = fopen(path, "r");
			if (fd) {
				/* Unparsable content counts as "no node" */
				if (fscanf(fd, "%d", &numa_node) != 1)
					numa_node = -1;
				fclose(fd);
			}
		}
	}

	if (pol->numa_node_set == 1)
		new->numa_node = get_numa_node(pol->numa_node);
	else
		new->numa_node = get_numa_node(numa_node);

	/* No device directory means no local_cpus file: allow every cpu */
	if (devpath == NULL) {
		cpus_setall(new->cpumask);
		goto out;
	}

	len = snprintf(path, sizeof(path), "%s/local_cpus", devpath);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		cpus_setall(new->cpumask);
		goto out;
	}

	fd = fopen(path, "r");
	if (!fd) {
		cpus_setall(new->cpumask);
		goto out;
	}

	/* getline() allocates the buffer when given a NULL pointer and size 0 */
	lcpu_mask = NULL;
	blen = 0;
	ret = getline(&lcpu_mask, &blen, fd);
	fclose(fd);
	if (ret <= 0) {
		cpus_setall(new->cpumask);
	} else {
		cpumask_parse_user(lcpu_mask, ret, new->cpumask);
	}
	free(lcpu_mask);

out:
	log(TO_CONSOLE, LOG_INFO, "Adding IRQ %d to database\n", irq);
	return new;
}
/*
 * Parse one "key=value" line printed by a policy script and store the
 * result in *pol.
 *
 * buf is modified in place: the '=' and the first '\n' after it are
 * replaced by string terminators.  Keys are matched case-insensitively:
 *
 *   ban            "true" / "false"
 *   balance_level  "none" / "package" / "cache" / "core" (stored as 0..3)
 *   numa_node      decimal node id; must be known to get_numa_node()
 *
 * Malformed lines, unknown keys and bad values are logged and ignored.
 */
static void parse_user_policy_key(char *buf, int irq, struct user_irq_policy *pol)
{
	static const char *const levelvals[] = { "none", "package", "cache", "core" };
	char *key = buf;
	char *sep = strchr(buf, '=');
	char *value;
	char *eol;
	int idx;
	int applied = 1;

	if (sep == NULL) {
		log(TO_SYSLOG, LOG_WARNING, "Bad format for policy, ignoring: %s\n", buf);
		return;
	}

	/* Split the line at '=': key keeps the left part, value the right */
	*sep = '\0';
	value = sep + 1;

	/* Drop the trailing newline, if any */
	eol = strchr(value, '\n');
	if (eol != NULL)
		*eol = '\0';

	if (strcasecmp("ban", key) == 0) {
		if (strcasecmp("false", value) == 0) {
			pol->ban = 0;
		} else if (strcasecmp("true", value) == 0) {
			pol->ban = 1;
		} else {
			applied = 0;
			log(TO_ALL, LOG_WARNING, "Unknown value for ban policy: %s\n", value);
		}
	} else if (strcasecmp("balance_level", key) == 0) {
		idx = 0;
		while (idx < 4 && strcasecmp(levelvals[idx], value) != 0)
			idx++;

		if (idx < 4) {
			pol->level = idx;
		} else {
			applied = 0;
			log(TO_ALL, LOG_WARNING, "Bad value for balance_level policy: %s\n", value);
		}
	} else if (strcasecmp("numa_node", key) == 0) {
		idx = strtoul(value, NULL, 10);
		if (!get_numa_node(idx)) {
			log(TO_ALL, LOG_WARNING, "NUMA node %d doesn't exist\n",
			    idx);
			return;
		}
		pol->numa_node = idx;
		pol->numa_node_set = 1;
	} else {
		applied = 0;
		log(TO_ALL, LOG_WARNING, "Unknown key returned, ignoring: %s\n", key);
	}

	if (applied)
		log(TO_ALL, LOG_INFO, "IRQ %d: Override %s to %s\n", irq, key, value);
}
/*
 * Run one policy script as "exec <script> <path> <irq>" and feed every
 * line it prints to parse_user_policy_key().
 *
 * Returns:
 *   the script's exit status (0..255) when it terminated normally,
 *   1  when the script could not be started (caller should ignore it),
 *   -1 when the script did not exit normally (e.g. killed by a signal) or
 *      its status could not be collected.
 */
static int run_script_for_policy(char *script, char *path, int irq, struct user_irq_policy *pol)
{
	char *cmd;
	size_t cmdlen;
	FILE *output;
	char buffer[128];
	int status;

	/* 64 spare bytes cover "exec ", the separators, the irq and the NUL */
	cmdlen = strlen(path) + strlen(script) + 64;
	cmd = alloca(cmdlen);
	snprintf(cmd, cmdlen, "exec %s %s %d", script, path, irq);

	output = popen(cmd, "r");
	if (!output) {
		log(TO_ALL, LOG_WARNING, "Unable to execute user policy script %s\n", script);
		return 1; /* tell caller to ignore this script */
	}

	/*
	 * Loop on fgets() itself: it returns NULL on both end-of-file and
	 * read error, so a failing stream cannot keep the loop spinning.
	 */
	while (fgets(buffer, sizeof(buffer), output))
		parse_user_policy_key(buffer, irq, pol);

	/*
	 * Only a normal exit carries a meaningful exit code; a script killed
	 * by a signal must not be mistaken for one that exited with 0.
	 */
	status = pclose(output);
	if (status == -1 || !WIFEXITED(status))
		return -1;

	return WEXITSTATUS(status);
}
/*
 * Calls out to a possibly user defined script to get user assigned policy
 * aspects for a given irq.  A value of -1 in a given field indicates no
 * policy was given and that system defaults should be used.
 *
 * polscript may name a single script or a directory of scripts:
 *
 *  - single script: its output is used only when it exits with 0.
 *  - directory: each regular file is run in readdir() order until one
 *    exits with 0.  Exit code 1 means "not my device"; a negative result
 *    or an exit code >= 2 is an error.  In both of those cases the
 *    script's output is discarded and the next script is tried.
 *
 * path is the sysfs device directory handed to the script; NULL is
 * replaced by SYSFS_DIR.
 */
static void get_irq_user_policy(char *path, int irq, struct user_irq_policy *pol)
{
	struct stat sbuf;
	DIR *poldir;
	struct dirent *entry;
	int ret;
	int accepted = 0;
	char script[1024];

	memset(pol, -1, sizeof(struct user_irq_policy));

	/* Return defaults if no script was given */
	if (!polscript)
		return;

	if (stat(polscript, &sbuf))
		return;

	/* Use SYSFS_DIR for irq has no sysfs entries */
	if (!path)
		path = SYSFS_DIR;

	if (!S_ISDIR(sbuf.st_mode)) {
		if (run_script_for_policy(polscript, path, irq, pol) != 0) {
			log(TO_CONSOLE, LOG_ERR, "policy script returned non-zero code! skipping user policy\n");
			memset(pol, -1, sizeof(struct user_irq_policy));
		}
		return;
	}

	/* polscript is a directory, use multiple script semantics */
	poldir = opendir(polscript);
	if (!poldir)
		return;

	while ((entry = readdir(poldir)) != NULL) {
		ret = snprintf(script, sizeof(script), "%s/%s", polscript, entry->d_name);
		/* Skip names that do not fit rather than run a truncated path */
		if (ret < 0 || (size_t)ret >= sizeof(script))
			continue;

		if (stat(script, &sbuf))
			continue;
		if (!S_ISREG(sbuf.st_mode))
			continue;

		/* Each script starts from a clean policy */
		memset(pol, -1, sizeof(struct user_irq_policy));
		ret = run_script_for_policy(script, path, irq, pol);
		if ((ret < 0) || (ret >= 2)) {
			log(TO_CONSOLE, LOG_ERR, "Error executing policy script %s : %d\n", script, ret);
			continue;
		}

		/* a ret of 1 means this script isn't for this irq */
		if (ret == 1)
			continue;

		log(TO_CONSOLE, LOG_DEBUG, "Accepting script %s to define policy for irq %d\n", script, irq);
		accepted = 1;
		break;
	}

	/* Release the directory stream; it would otherwise leak on every call */
	closedir(poldir);

	/*
	 * No script claimed this irq: drop whatever the last rejected or
	 * failed script printed so that only defaults are reported.
	 */
	if (!accepted)
		memset(pol, -1, sizeof(struct user_irq_policy));
}
static int check_for_module_ban(char *name)
{
GList *entry;
entry = g_list_find_custom(cl_banned_modules, name, substr_find);
if (entry)
return 1;
else
return 0;
}
/*
 * Decide whether irq is banned by command-line options.
 *
 * Returns 1 when the irq is listed in cl_banned_irqs, or when the entry
 * for it in proc_interrupts carries a name that matches a banned module;
 * returns 0 otherwise.  path is unused.
 */
static int check_for_irq_ban(char *path __attribute__((unused)), int irq, GList *proc_interrupts)
{
	struct irq_info key;
	struct irq_info *found;
	GList *match;

	key.irq = irq;

	/* Was this irq banned directly on the command line? */
	if (g_list_find_custom(cl_banned_irqs, &key, compare_ints) != NULL)
		return 1;

	/* Otherwise, was the module the irq belongs to banned? */
	match = g_list_find_custom(proc_interrupts, &key, compare_ints);
	if (match == NULL)
		return 0;

	found = match->data;
	return check_for_module_ban(found->name) ? 1 : 0;
}
/*
 * Figures out which interrupt(s) relate to the device we're looking at in
 * dirname (a directory name below SYSPCI_DIR) and adds them to the irq
 * database, or to banned_irqs when policy bans them.
 *
 * If the device has an msi_irqs directory, every non-zero numeric entry in
 * it is added as IRQ_TYPE_MSIX and the "irq" file is not consulted.
 * Otherwise the single number in the "irq" file is added as
 * IRQ_TYPE_LEGACY.
 *
 * tmp_irqs is the irq list used to look up names for module bans.
 */
static void build_one_dev_entry(const char *dirname, GList *tmp_irqs)
{
	struct dirent *entry;
	DIR *msidir;
	FILE *fd;
	int irqnum;
	struct irq_info *new;
	char path[PATH_MAX];
	char devpath[PATH_MAX];
	struct user_irq_policy pol;

	snprintf(path, sizeof(path), "%s/%s/msi_irqs", SYSPCI_DIR, dirname);
	snprintf(devpath, sizeof(devpath), "%s/%s", SYSPCI_DIR, dirname);

	msidir = opendir(path);
	if (msidir) {
		while ((entry = readdir(msidir)) != NULL) {
			/* Non-numeric names such as "." and ".." convert to 0 */
			irqnum = strtol(entry->d_name, NULL, 10);
			if (!irqnum)
				continue;

			/* Already known, either active or banned */
			new = get_irq_info(irqnum);
			if (new)
				continue;

			get_irq_user_policy(devpath, irqnum, &pol);
			if ((pol.ban == 1) || (check_for_irq_ban(devpath, irqnum, tmp_irqs))) {
				add_banned_irq(irqnum, &banned_irqs);
				continue;
			}

			new = add_one_irq_to_db(devpath, irqnum, &pol);
			if (!new)
				continue;
			new->type = IRQ_TYPE_MSIX;
		}
		closedir(msidir);
		return;
	}

	snprintf(path, sizeof(path), "%s/%s/irq", SYSPCI_DIR, dirname);
	fd = fopen(path, "r");
	if (!fd)
		return;

	/*
	 * Require exactly one converted item: a return of 0 (no number in
	 * the file) would leave irqnum uninitialized.
	 */
	if (fscanf(fd, "%d", &irqnum) != 1)
		goto done;

	/*
	 * no pci device has irq 0
	 * irq 255 is invalid on x86/x64 architectures
	 */
	if (!irqnum)
		goto done;
#if defined(__i386__) || defined(__x86_64__)
	if (irqnum == 255)
		goto done;
#endif

	new = get_irq_info(irqnum);
	if (new)
		goto done;

	get_irq_user_policy(devpath, irqnum, &pol);
	if ((pol.ban == 1) || (check_for_irq_ban(devpath, irqnum, tmp_irqs))) {
		add_banned_irq(irqnum, &banned_irqs);
		goto done;
	}

	new = add_one_irq_to_db(devpath, irqnum, &pol);
	if (!new)
		goto done;
	new->type = IRQ_TYPE_LEGACY;

done:
	fclose(fd);
	return;
}
/* for_each_irq() callback that releases one irq_info; data is unused. */
static void free_irq(struct irq_info *info, void *data __attribute__((unused)))
{
free(info);
}
/*
 * Frees every irq_info held in interrupts_db and banned_irqs, releases both
 * lists as well as the rebalance_irq_list list nodes, and resets all three
 * pointers to NULL.
 *
 * Statement order matters: for_each_irq() substitutes interrupts_db when it
 * is handed a NULL list, so interrupts_db must already be reset to NULL by
 * the time banned_irqs is walked.  Otherwise an empty banned_irqs would
 * walk the interrupts_db entries a second time.
 */
void free_irq_db(void)
{
/* A NULL list argument selects interrupts_db */
for_each_irq(NULL, free_irq, NULL);
g_list_free(interrupts_db);
interrupts_db = NULL;
for_each_irq(banned_irqs, free_irq, NULL);
g_list_free(banned_irqs);
banned_irqs = NULL;
/* Only the list nodes are released here, not the entries they point to */
g_list_free(rebalance_irq_list);
rebalance_irq_list = NULL;
}
/*
 * Release the command-line ban lists (modules and irqs) together with the
 * banned_irqs list, including the elements they own, and reset the list
 * heads so no dangling pointers remain.
 */
void free_cl_opts(void)
{
	g_list_free_full(cl_banned_modules, free);
	cl_banned_modules = NULL;

	g_list_free_full(cl_banned_irqs, free);
	cl_banned_irqs = NULL;

	/*
	 * banned_irqs entries are allocated by add_banned_irq(); free them
	 * along with the list nodes instead of leaking them.
	 */
	g_list_free_full(banned_irqs, free);
	banned_irqs = NULL;
}
/*
 * Add an irq that has no sysfs device directory.
 *
 * Does nothing when the irq is already known (active or banned).
 * Otherwise the user policy is consulted with a NULL device path and the
 * irq is placed on banned_irqs or in the database accordingly.  When hint
 * is given, its type and class are copied onto the resulting entry.
 *
 * NOTE(review): the level is always recomputed from the class at the end,
 * which replaces any level that add_one_irq_to_db() took from the policy.
 */
static void add_new_irq(int irq, struct irq_info *hint, GList *proc_interrupts)
{
	struct user_irq_policy pol;
	struct irq_info *info;

	if (get_irq_info(irq) != NULL)
		return;

	/* Set NULL devpath for the irq has no sysfs entries */
	get_irq_user_policy(NULL, irq, &pol);
	if ((pol.ban == 1) || check_for_irq_ban(NULL, irq, proc_interrupts)) { /*FIXME*/
		add_banned_irq(irq, &banned_irqs);
		info = get_irq_info(irq);
	} else {
		info = add_one_irq_to_db(NULL, irq, &pol);
	}

	if (info == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "add_new_irq: Failed to add irq %d\n", irq);
		return;
	}

	/*
	 * Override some of the new irq defaults here
	 */
	if (hint != NULL) {
		info->type = hint->type;
		info->class = hint->class;
	}

	info->level = map_class_to_level[info->class];
}
static void add_missing_irq(struct irq_info *info, void *attr)
{
struct irq_info *lookup = get_irq_info(info->irq);
GList *proc_interrupts = (GList *) attr;
if (!lookup)
add_new_irq(info->irq, info, proc_interrupts);
}
/*
 * Throw away the current irq database and rebuild it.
 *
 * Steps: free the old database, collect the full irq list, add the irqs of
 * every device found under SYSPCI_DIR, then add whatever irqs from the
 * collected list are still missing.
 */
void rebuild_irq_db(void)
{
	DIR *devdir;
	struct dirent *entry;
	GList *tmp_irqs = NULL;

	free_irq_db();

	tmp_irqs = collect_full_irq_list();

	devdir = opendir(SYSPCI_DIR);
	if (devdir) {
		while ((entry = readdir(devdir)) != NULL)
			build_one_dev_entry(entry->d_name, tmp_irqs);
		closedir(devdir);
	}

	/*
	 * add_missing_irq() forwards its attr argument as the list that
	 * check_for_irq_ban() searches for the irq's name.  That has to be
	 * the collected list: interrupts_db can never contain an irq that is
	 * still missing, so module bans would never match for these irqs.
	 */
	for_each_irq(tmp_irqs, add_missing_irq, tmp_irqs);

	g_list_free_full(tmp_irqs, free);
}
/*
 * Invoke cb(entry, data) for every element of list, starting from the
 * first node.  A NULL list selects interrupts_db.  The next node is
 * fetched before each callback runs.
 */
void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data)
{
	GList *cur;
	GList *following;

	if (list == NULL)
		list = interrupts_db;

	for (cur = g_list_first(list); cur != NULL; cur = following) {
		following = g_list_next(cur);
		cb(cur->data, data);
	}
}
struct irq_info *get_irq_info(int irq)
{
GList *entry;
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry)
entry = g_list_find_custom(banned_irqs, &find, compare_ints);
return entry ? entry->data : NULL;
}
/*
 * Move the entry whose irq number equals info->irq from list *from to the
 * end of list *to, and set info->moved.  Nothing happens when *from holds
 * no such entry.
 */
void migrate_irq(GList **from, GList **to, struct irq_info *info)
{
	struct irq_info key;
	struct irq_info *moving;
	GList *link;

	key.irq = info->irq;
	link = g_list_find_custom(*from, &key, compare_ints);
	if (link == NULL)
		return;

	/* Keep hold of the payload before its list node is deleted */
	moving = link->data;
	*from = g_list_delete_link(*from, link);
	*to = g_list_append(*to, moving);

	info->moved = 1;
}
/*
 * Sort comparison for irq lists: larger class first, then larger load
 * first, and finally by object address.  Never returns 0; it yields -1
 * when every comparison ties.
 */
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
	const struct irq_info *a = A;
	const struct irq_info *b = B;
	gint order = -1;

	if (a->class < b->class)
		order = 1;
	else if (a->class > b->class)
		order = -1;
	else if (a->load < b->load)
		order = 1;
	else if (a->load > b->load)
		order = -1;
	else if (a < b)
		order = 1;

	return order;
}
/* Sort *list in place using sort_irqs() and store the new list head back. */
void sort_irq_list(GList **list)
{
*list = g_list_sort(*list, sort_irqs);
}