irqbalance/placement.c
nhorman 565db19e3d Enhance irqbalance logic to consider PCI bus topology in IRQ mapping
This patch enhances irqbalance so that it considers the proximity of devices to
CPUs when making IRQ mapping decisions.  On large NUMA systems this causes
IRQs to be biased toward CPUs on the same NUMA node as the device triggering
them.
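
For example, assuming the usual ACPI SLIT convention in which a node's
distance to itself is 10, an IRQ whose device sits on node 0, evaluated
against an object on node 1 at numa_distance() 21, accrues an extra
NUMA_PENALTY * (21 - 10) in each cost function below, so objects on the
device's own node win whenever workloads are otherwise comparable.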

Resolves:
http://code.google.com/p/irqbalance/issues/detail?id=17

Thanks to Petr Holasek for authoring the patch.

git-svn-id: https://irqbalance.googlecode.com/svn/trunk@34 46b42954-3823-0410-bd82-eb80b452c9b5
2011-08-01 18:25:02 +00:00


/*
 * Copyright (C) 2006, Intel Corporation
 *
 * This file is part of irqbalance
 *
 * This program file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program in a file named COPYING; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301 USA
 */
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include "types.h"
#include "irqbalance.h"
int power_mode;
extern GList *interrupts, *packages, *cache_domains, *cpus;
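/*
 * Overview of the cost model implemented below: each of the three
 * *_cost_func routines returns the irq's workload plus a "bonus"
 * (penalty).  The cold-move penalty shrinks with the size of the
 * target object (package: CROSS_PACKAGE_PENALTY, cache domain: /2,
 * core: /3); all three add the same NUMA-distance term and a large
 * 600000 penalty when the target has no cpu in the allowed mask,
 * which effectively blocks the move.
 */
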
static uint64_t package_cost_func(struct interrupt *irq, struct package *package)
{
        int bonus = 0;
        int maxcount;
        int dist;

        /* moving to a cold package/cache/etc gets you a 3000 penalty */
        if (!cpus_intersects(irq->old_mask, package->mask))
                bonus = CROSS_PACKAGE_PENALTY;

        /* do a little numa affinity */
        if (irq->node_num != package->node_num) {
                if (irq->node_num >= 0 && package->node_num >= 0) {
                        dist = numa_distance(irq->node_num, package->node_num);
                        /* moving to a distant numa node results in a penalty */
                        bonus += (dist > 10) ? NUMA_PENALTY * (dist - 10) : 0;
                }
        }

        /* but if the irq has had 0 interrupts for a while, move it about more easily */
        if (irq->workload == 0)
                bonus = bonus / 10;

        /* in power save mode, you had better be on package 0, with overflow to the next package if really needed */
        if (power_mode)
                bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number;

        /* if we're out of whack in terms of per-class counts, just block (except in power mode) */
        maxcount = (class_counts[irq->class] + package_count - 1) / package_count;
        if (package->class_count[irq->class] >= maxcount && !power_mode)
                bonus += 300000;

        /* if the package has no cpus in the allowed mask, just block */
        if (!cpus_intersects(irq->allowed_mask, package->mask))
                bonus += 600000;

        return irq->workload + bonus;
}

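/*
 * Worked example for package_cost_func(): an irq with workload 100 that
 * already ran on this package (no cold penalty), device on node 0 and
 * package on node 1 at numa_distance() 21, not in power mode, class
 * quota not exceeded, allowed mask overlapping:
 *   cost = 100 + NUMA_PENALTY * (21 - 10)
 */
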
static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
{
        int bonus = 0;
        int dist;

        /* moving to a cold cache gets you a 1500 penalty */
        if (!cpus_intersects(irq->old_mask, cache_domain->mask))
                bonus = CROSS_PACKAGE_PENALTY / 2;

        /* do a little numa affinity */
        if (irq->node_num != cache_domain->node_num) {
                if (irq->node_num >= 0 && cache_domain->node_num >= 0) {
                        dist = numa_distance(irq->node_num, cache_domain->node_num);
                        /* moving to a distant numa node results in a penalty */
                        bonus += (dist > 10) ? NUMA_PENALTY * (dist - 10) : 0;
                }
        }

        /* but if the irq has had 0 interrupts for a while, move it about more easily */
        if (irq->workload == 0)
                bonus = bonus / 10;

        /* pay 6000 for each previous interrupt of the same class */
        bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];

        /* try to avoid having a lot of MSI interrupts (globally, not by device id) on the cache domain */
        if (irq->msi == 1)
                bonus += MSI_CACHE_PENALTY * cache_domain->class_count[irq->class];

        /* if the cache domain has no cpus in the allowed mask, just block */
        if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
                bonus += 600000;

        return irq->workload + bonus;
}

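/*
 * Note that the MSI penalty above scales with the same per-class count
 * as the class-violation penalty, so a cache domain that already hosts
 * several interrupts of one class is doubly expensive for one more MSI
 * interrupt of that class.
 */
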
static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
{
        int bonus = 0;
        int dist;

        /* moving to a colder core gets you a 1000 penalty */
        if (!cpus_intersects(irq->old_mask, cpu->mask))
                bonus = CROSS_PACKAGE_PENALTY / 3;

        /* do a little numa affinity */
        if (irq->node_num != cpu->node_num) {
                if (irq->node_num >= 0 && cpu->node_num >= 0) {
                        dist = numa_distance(irq->node_num, cpu->node_num);
                        /* moving to a distant numa node results in a penalty */
                        bonus += (dist > 10) ? NUMA_PENALTY * (dist - 10) : 0;
                }
        }

        /* but if the irq has had 0 interrupts for a while, move it about more easily */
        if (irq->workload == 0)
                bonus = bonus / 10;

        /*
         * since some chipsets only place at the first cpu, give a tiny preference to non-first
         * cpus for specifically placed interrupts
         */
        if (first_cpu(cpu->cache_mask) == cpu->number)
                bonus++;

        /* pay 6000 for each previous interrupt of the same class */
        bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];

        /* if the core has no cpus in the allowed mask, just block */
        if (!cpus_intersects(irq->allowed_mask, cpu->mask))
                bonus += 600000;

        return irq->workload + bonus;
}

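/*
 * The three place_* routines below share one greedy pattern: walk the
 * object's interrupt list, evaluate the matching cost function against
 * every child object, and hand the irq to the cheapest candidate,
 * updating the winner's running workload and per-class count so that
 * later irqs in the same pass see the new load.
 */
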
static void place_cache_domain(struct package *package)
{
        GList *iter, *next;
        GList *pkg;
        struct interrupt *irq;
        struct cache_domain *cache_domain;

        iter = g_list_first(package->interrupts);
        while (iter) {
                struct cache_domain *best = NULL;
                uint64_t best_cost = INT_MAX;

                irq = iter->data;
                if (irq->balance_level <= BALANCE_PACKAGE) {
                        iter = g_list_next(iter);
                        continue;
                }
                pkg = g_list_first(package->cache_domains);
                while (pkg) {
                        uint64_t newload;

                        cache_domain = pkg->data;
                        newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain);
                        if (newload < best_cost) {
                                best = cache_domain;
                                best_cost = newload;
                        }
                        pkg = g_list_next(pkg);
                }
                if (best) {
                        next = g_list_next(iter);
                        package->interrupts = g_list_delete_link(package->interrupts, iter);

                        best->workload += irq->workload + 1;
                        best->interrupts = g_list_append(best->interrupts, irq);
                        best->class_count[irq->class]++;
                        irq->mask = best->mask;
                        iter = next;
                } else
                        iter = g_list_next(iter);
        }
}

static void place_core(struct cache_domain *cache_domain)
{
        GList *iter, *next;
        GList *pkg;
        struct interrupt *irq;
        struct cpu_core *cpu;

        iter = g_list_first(cache_domain->interrupts);
        while (iter) {
                struct cpu_core *best = NULL;
                uint64_t best_cost = INT_MAX;

                irq = iter->data;
                /* if the irq isn't per-core policy and is not very busy, leave it at the cache-domain level */
                if (irq->balance_level <= BALANCE_CACHE &&
                    irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) {
                        iter = g_list_next(iter);
                        continue;
                }
                pkg = g_list_first(cache_domain->cpu_cores);
                while (pkg) {
                        uint64_t newload;

                        cpu = pkg->data;
                        newload = cpu->workload + cpu_cost_func(irq, cpu);
                        if (newload < best_cost) {
                                best = cpu;
                                best_cost = newload;
                        }
                        pkg = g_list_next(pkg);
                }
                if (best) {
                        next = g_list_next(iter);
                        cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter);

                        best->workload += irq->workload + 1;
                        best->interrupts = g_list_append(best->interrupts, irq);
                        best->class_count[irq->class]++;
                        irq->mask = best->mask;
                        iter = next;
                } else
                        iter = g_list_next(iter);
        }
}

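/*
 * An irq below CORE_SPECIFIC_THRESHOLD therefore stays at cache-domain
 * granularity unless its policy demands a specific core or
 * one_shot_mode forces a full placement on the single balancing pass.
 */
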
static void place_packages(GList *list)
{
        GList *iter;
        GList *pkg;
        struct interrupt *irq;
        struct package *package;

        iter = g_list_first(list);
        while (iter) {
                struct package *best = NULL;
                uint64_t best_cost = INT_MAX;

                irq = iter->data;
                if (irq->balance_level == BALANCE_NONE) {
                        iter = g_list_next(iter);
                        continue;
                }
                pkg = g_list_first(packages);
                while (pkg) {
                        uint64_t newload;

                        package = pkg->data;
                        newload = package->workload + package_cost_func(irq, package);
                        if (newload < best_cost) {
                                best = package;
                                best_cost = newload;
                        }
                        pkg = g_list_next(pkg);
                }
                if (best) {
                        best->workload += irq->workload + 1;
                        best->interrupts = g_list_append(best->interrupts, irq);
                        best->class_count[irq->class]++;
                        irq->mask = best->mask;
                }
                iter = g_list_next(iter);
        }
}

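/*
 * Unlike place_cache_domain() and place_core(), place_packages() does
 * not unlink the irq from the list it walks: the global interrupts
 * list stays intact and the irq is only appended to the winning
 * package's own list.
 */
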
static void place_affinity_hint(GList *list)
{
        /* still need to balance the workload within the affinity_hint mask */
        GList *iter;
        struct interrupt *irq;

        iter = g_list_first(list);
        while (iter) {
                irq = iter->data;
                if (irq->balance_level == BALANCE_NONE) {
                        iter = g_list_next(iter);
                        continue;
                }
                if ((!cpus_empty(irq->node_mask)) &&
                    (!cpus_equal(irq->mask, irq->node_mask)) &&
                    (!__cpus_full(&irq->node_mask, num_possible_cpus()))) {
                        irq->old_mask = irq->mask;
                        irq->mask = irq->node_mask;
                }
                iter = g_list_next(iter);
        }
}

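/*
 * The node_mask override above is deliberately narrow: it applies only
 * when the mask is non-empty, actually differs from the mask chosen by
 * the placement passes, and is not simply "all cpus" (an uninitialized
 * hint).  old_mask is saved so the cost functions can still detect a
 * cold move on the next pass.
 */
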
static void do_unroutables(void)
{
        struct package *package;
        struct cache_domain *cache_domain;
        struct cpu_core *cpu;
        struct interrupt *irq;
        GList *iter, *inter;

        inter = g_list_first(interrupts);
        while (inter) {
                irq = inter->data;
                inter = g_list_next(inter);
                if (irq->balance_level != BALANCE_NONE)
                        continue;

                iter = g_list_first(packages);
                while (iter) {
                        package = iter->data;
                        if (cpus_intersects(package->mask, irq->node_mask) ||
                            cpus_intersects(package->mask, irq->mask))
                                package->workload += irq->workload;
                        iter = g_list_next(iter);
                }

                iter = g_list_first(cache_domains);
                while (iter) {
                        cache_domain = iter->data;
                        if (cpus_intersects(cache_domain->mask, irq->node_mask) ||
                            cpus_intersects(cache_domain->mask, irq->mask))
                                cache_domain->workload += irq->workload;
                        iter = g_list_next(iter);
                }

                iter = g_list_first(cpus);
                while (iter) {
                        cpu = iter->data;
                        if (cpus_intersects(cpu->mask, irq->node_mask) ||
                            cpus_intersects(cpu->mask, irq->mask))
                                cpu->workload += irq->workload;
                        iter = g_list_next(iter);
                }
        }
}

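/*
 * The effect of do_unroutables() is that pinned (BALANCE_NONE) irqs
 * still count against every package, cache domain, and cpu they could
 * run on, so the placement passes steer movable irqs away from objects
 * already loaded by unroutable ones.
 */
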
void calculate_placement(void)
{
        struct package *package;
        struct cache_domain *cache_domain;
        GList *iter;

        /* first clear old data */
        clear_work_stats();
        sort_irq_list();
        do_unroutables();

        place_packages(interrupts);
        iter = g_list_first(packages);
        while (iter) {
                package = iter->data;
                place_cache_domain(package);
                iter = g_list_next(iter);
        }

        iter = g_list_first(cache_domains);
        while (iter) {
                cache_domain = iter->data;
                place_core(cache_domain);
                iter = g_list_next(iter);
        }

        /*
         * if affinity_hint is populated on the irq and is not set to
         * all CPUs (meaning it's initialized), honor it above
         * anything in the package locality/workload.
         */
        place_affinity_hint(interrupts);
}

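/*
 * A rough sketch of where calculate_placement() fits; only the
 * functions defined or called in this file are certain, the rest is an
 * assumed shape of the daemon's main loop elsewhere in irqbalance:
 *
 *   for (;;) {
 *           sleep_approx(interval);       // pacing helper (assumed name)
 *           parse_proc_interrupts();      // refresh per-irq counts (assumed)
 *           calculate_placement();        // this file: choose new masks
 *           activate_mapping();           // write masks to /proc (assumed)
 *   }
 */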