irqbalance/irqbalance.h
Krister Johansen 7bc1244fbf Teach irqbalance about Intel CoD.
This originally surfaced as a bug in placing network interrupts.  In
the case that this submitter observed, the NIC card was in NUMA domain
0, but each RSS interrupt was getting an affinity list for all CPUs in
the domain.  The expected behavior is for a single cpu to be chosen when
attempting to fan out NIC interrupts.  Due to other implementation
details of interrupt placement, this effectively caused all interrupt
mappings for this NIC to end up on CPU 0.

The bug turns out to have been caused by Intel Cluster on Die breaking
an assumption in irqbalance about the design of the component hierarchy.
The CoD topology allows a CPU package to belong to more than one NUMA
node, which is not expected.

The RCA was that when the second NUMA node was wired up to the existing
physical package, it overwrote the mappings that were placed there by
the first.

This patch attempts to solve that problem by permitting a package to
have multiple NUMA nodes.  The CPU component hierarchy is preserved, in
case other parts of the code depend upon walking it.  When a CoD
topology is detected, the NUMA node -> CPU component mapping is moved
down a level, so that the nodes point to the first level where the
affinity becomes distinct.  In practice, this has been observed to be
the LLC.

A quick illustration (now, with CoD, it looks like this):

                 +-----------+
                 | NUMA Node |
                 |     0     |
                 +-----------+
                       |
                       |        +-------+
                      \|/     / | CPU 0 |
                   +---------+  +-------+
                   | Cache 0 |
                   +---------+  +-------+
                   /          \ | CPU 1 |
      +-----------+             +-------+
      | Package 0 |
      +-----------+             +-------+
                  \           / | CPU 2 |
                   +---------+  +-------+
                   | Cache 1 |
                   +---------+
                       ^      \ +-------+
                       |        | CPU 3 |
                       |        +-------+
                 +-----------+
                 | NUMA Node |
                 |     1     |
                 +-----------+

Whereas, previously only NUMA Node 1 would end up pointing to package 0.
The topology should not be different on platforms that do not enable
CoD.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
2017-07-11 09:21:04 -07:00

165 lines
3.9 KiB
C

#ifndef __INCLUDE_GUARD_IRQBALANCE_H_
#define __INCLUDE_GUARD_IRQBALANCE_H_
#include "constants.h"
#include "cpumask.h"
#include <stdint.h>
#include <glib.h>
#include <glib-unix.h>
#include <syslog.h>
#include <limits.h>
#include "types.h"
#include "config.h"
/*
 * Define AARCH64 whenever the compiler's predefined __aarch64__ macro is
 * present, so code including this header can test the shorter name.
 */
#ifdef __aarch64__
#define AARCH64
#endif
#ifdef HAVE_NUMA_H
#include <numa.h>
#else
/*
 * Fallback for builds without <numa.h>: numa_available() becomes a constant
 * -1, the value libnuma itself documents for "NUMA not usable" (numa(3)).
 * The expansion is parenthesized so it remains a single operand wherever
 * the call is written inside a larger expression.
 */
#define numa_available() (-1)
#endif
#ifdef HAVE_LIBSYSTEMD
#include <systemd/sd-journal.h>
#endif
extern int package_count;
extern int cache_domain_count;
extern int core_count;
extern char *classes[];
extern void parse_cpu_tree(void);
extern void clear_work_stats(void);
extern void parse_proc_interrupts(void);
/*
 * Returns a GList pointer; takes no arguments.  Declared with an explicit
 * (void) parameter list so this is a true prototype: before C23 an empty ()
 * leaves the argument list unspecified and unchecked by the compiler.
 */
extern GList *collect_full_irq_list(void);
extern void parse_proc_stat(void);
extern void set_interrupt_count(int number, uint64_t count);
extern void set_msi_interrupt_numa(int number);
extern GList *rebalance_irq_list;
void update_migration_status(void);
void dump_workloads(void);
void sort_irq_list(GList **list);
void calculate_placement(void);
void dump_tree(void);
void activate_mappings(void);
void clear_cpu_tree(void);
/*===================NEW BALANCER FUNCTIONS============================*/
/*
* Master topo_obj type lists
*/
extern GList *numa_nodes;
extern GList *packages;
extern GList *cache_domains;
extern GList *cpus;
extern int numa_avail;
extern GList *cl_banned_irqs;
extern int debug_mode;
extern int journal_logging;
extern int one_shot_mode;
extern int need_rescan;
extern unsigned long long cycle_count;
extern unsigned long power_thresh;
extern unsigned long deepest_cache;
extern char *banscript;
extern char *polscript;
extern cpumask_t banned_cpus;
extern cpumask_t unbanned_cpus;
extern long HZ;
/*
* Numa node access routines
*/
extern void build_numa_node_list(void);
extern void free_numa_node_list(void);
extern void dump_numa_node_info(struct topo_obj *node, void *data);
extern void connect_cpu_mem_topo(struct topo_obj *p, void *data);
extern struct topo_obj *get_numa_node(int nodeid);
/*
* cpu core functions
*/
/*
 * cpu_numa_node(cpu) - expands to the numa_nodes member of the object that
 * cpu->parent points to.  The argument is evaluated once.  cpu->parent is
 * dereferenced without a NULL check, so it must be set before this is used.
 * The member's type comes from the topo_obj declaration, which is not
 * visible in this header.
 */
#define cpu_numa_node(cpu) ((cpu)->parent->numa_nodes)
extern struct topo_obj *find_cpu_core(int cpunr);
extern int get_cpu_count(void);
/*
* irq db functions
*/
extern void rebuild_irq_db(void);
extern void free_irq_db(void);
extern void add_cl_banned_irq(int irq);
extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data);
extern struct irq_info *get_irq_info(int irq);
extern void migrate_irq(GList **from, GList **to, struct irq_info *info);
extern void free_cl_opts(void);
extern void add_cl_banned_module(char *modname);
/*
 * irq_numa_node(irq) - expands to the numa_node member of the object that
 * irq points to.  The argument is evaluated once and is not NULL-checked.
 */
#define irq_numa_node(irq) ((irq)->numa_node)
/*
* Generic object functions
*/
/*
 * Call cb(element, data) once for every element of a GList.
 *
 * list: any link of the list; the walk always begins at the head returned
 *       by g_list_first(), not at the link passed in.  NULL means no calls.
 * cb:   callback invoked with each element's data pointer.
 * data: opaque pointer handed to cb unchanged on every call.
 */
static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data)
{
	GList *node;
	GList *upcoming;

	for (node = g_list_first(list); node != NULL; node = upcoming) {
		/*
		 * Read the successor before running the callback so the walk
		 * never touches 'node' again after cb returns.
		 */
		upcoming = g_list_next(node);
		cb(node->data, data);
	}
}
/*
* Logging functions
*/
/* Destination bits; combine with | to address several sinks at once. */
#define TO_SYSLOG (1 << 0)
#define TO_CONSOLE (1 << 1)
#define TO_ALL (TO_SYSLOG | TO_CONSOLE)
extern const char * log_indent;
extern unsigned int log_mask;
/*
 * log(mask, lvl, fmt, ...) - emit one printf-style message.
 *
 * mask: TO_* bits naming the sinks this message is meant for.  A sink is
 *       used only when its bit is set in both mask and the global log_mask.
 * lvl:  priority value passed to syslog()/sd_journal_print().
 * fmt:  printf-style format string, followed by its arguments.
 *
 * When journal_logging is non-zero:
 *   - with HAVE_LIBSYSTEMD the message always goes to sd_journal_print(),
 *     and additionally to stdout if TO_CONSOLE is enabled;
 *   - without it the message is printed to stdout prefixed by "<lvl>",
 *     regardless of mask and log_mask.
 * Otherwise the message goes to syslog() and/or stdout as selected.
 *
 * mask and lvl are parenthesized in the expansion so that compound
 * arguments such as TO_SYSLOG | TO_CONSOLE keep their meaning next to the
 * & operators.  The format arguments are evaluated once per sink that
 * fires, so they must not have side effects.
 *
 * NOTE(review): the macro shares its name with log() from <math.h>;
 * confirm no file needs both.
 */
#ifdef HAVE_LIBSYSTEMD
#define log(mask, lvl, fmt, args...) do { \
	if (journal_logging) { \
		sd_journal_print((lvl), fmt, ##args); \
		if (log_mask & (mask) & TO_CONSOLE) { \
			printf(fmt, ##args); \
		} \
	} else { \
		if (log_mask & (mask) & TO_SYSLOG) { \
			syslog((lvl), fmt, ##args); \
		} \
		if (log_mask & (mask) & TO_CONSOLE) { \
			printf(fmt, ##args); \
		} \
	} \
} while (0)
#else /* ! HAVE_LIBSYSTEMD */
#define log(mask, lvl, fmt, args...) do { \
	if (journal_logging) { \
		printf("<%d>", (lvl)); \
		printf(fmt, ##args); \
	} else { \
		if (log_mask & (mask) & TO_SYSLOG) { \
			syslog((lvl), fmt, ##args); \
		} \
		if (log_mask & (mask) & TO_CONSOLE) { \
			printf(fmt, ##args); \
		} \
	} \
} while (0)
#endif /* HAVE_LIBSYSTEMD */
/*
 * String constant named as a socket path.
 * NOTE(review): how this string is turned into a socket address is not
 * visible in this header - confirm against the socket setup code.
 */
#define SOCKET_PATH "irqbalance"
#endif /* __INCLUDE_GUARD_IRQBALANCE_H_ */