irqbalance/placement.c
nhorman 3c7fe6d3cc Fix affinity_hint code.
There's a problem with the affinity_hint code.  Specifically, it fails to
detect when an affinity_hint file is all f's (affinity_hint == all cpus).  As
such, irqbalance currently allows all cpus to handle all irqs, which is the
antithesis of its function.  It also makes use of cpus_full, which is poorly
suited here, as it assumes that the affinity_hint mask is always NR_CPUS in
length with every bit set (even for cpus not actually present in the system).
This patch corrects both of those problems by checking only the cpus that are
present in the system against the mask, and detecting when that mask is all
f's.  (An illustrative sketch of the check follows the commit metadata below.)

Signed-off-by: Neil Horman  <nhorman@tuxdriver.com>




git-svn-id: https://irqbalance.googlecode.com/svn/trunk@30 46b42954-3823-0410-bd82-eb80b452c9b5
2010-08-10 13:32:58 +00:00
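
The fix described in the commit message amounts to "ignore an affinity_hint that
names every present cpu, since it carries no placement information". The fragment
below is only a minimal, self-contained sketch of that idea, not the patch itself:
the helper name hint_covers_all_present and the plain bool-array masks are
assumptions made for illustration, whereas the real code operates on irqbalance's
cpumask_t via its cpus_* helpers and __cpus_full.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical standalone illustration: a hint that sets every present CPU
 * carries no placement information, so it should be ignored rather than
 * letting every CPU handle the IRQ. */
static bool hint_covers_all_present(const bool *hint, const bool *present, int ncpus)
{
	for (int i = 0; i < ncpus; i++) {
		/* only CPUs that are actually present matter; bits for absent
		 * CPUs (padding up to NR_CPUS) are ignored */
		if (present[i] && !hint[i])
			return false;
	}
	return true;
}

int main(void)
{
	bool present[8] = { 1, 1, 1, 1, 0, 0, 0, 0 };  /* 4-cpu box, mask padded to 8 */
	bool all_f[8]   = { 1, 1, 1, 1, 1, 1, 1, 1 };  /* affinity_hint == all f's */
	bool pinned[8]  = { 0, 1, 0, 0, 0, 0, 0, 0 };  /* hint naming cpu 1 only */

	printf("all f's hint ignored: %d\n", hint_covers_all_present(all_f, present, 8));
	printf("pinned hint honored:  %d\n", !hint_covers_all_present(pinned, present, 8));
	return 0;
}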

/*
 * Copyright (C) 2006, Intel Corporation
 *
 * This file is part of irqbalance
 *
 * This program file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program in a file named COPYING; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301 USA
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include "types.h"
#include "irqbalance.h"
int power_mode;
extern GList *interrupts, *packages, *cache_domains, *cpus;
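
/*
 * Cost functions: each returns the irq's workload plus a set of penalty
 * "bonuses" for undesirable placements (a cold package/cache/core, the wrong
 * numa node, class-count overruns, or a mask outside the allowed cpus).
 * Lower values are better; the place_* walkers below pick the object with
 * the smallest projected cost.
 */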
static uint64_t package_cost_func(struct interrupt *irq, struct package *package)
{
	int bonus = 0;
	int maxcount;
	/* moving to a cold package/cache/etc gets you a 3000 penalty */
	if (!cpus_intersects(irq->old_mask, package->mask))
		bonus = CROSS_PACKAGE_PENALTY;
	/* do a little numa affinity */
	if (!cpus_intersects(irq->numa_mask, package->mask))
		bonus += NUMA_PENALTY;
	/* but if the irq has had 0 interrupts for a while move it about more easily */
	if (irq->workload==0)
		bonus = bonus / 10;
	/* in power save mode, you better be on package 0, with overflow to the next package if really needed */
	if (power_mode)
		bonus += POWER_MODE_PACKAGE_THRESHOLD * package->number;
	/* if we're out of whack in terms of per class counts.. just block (except in power mode) */
	maxcount = (class_counts[irq->class] + package_count -1 ) / package_count;
	if (package->class_count[irq->class]>=maxcount && !power_mode)
		bonus += 300000;
	/* if the package has no cpus in the allowed mask.. just block */
	if (!cpus_intersects(irq->allowed_mask, package->mask))
		bonus += 600000;
	return irq->workload + bonus;
}

static uint64_t cache_domain_cost_func(struct interrupt *irq, struct cache_domain *cache_domain)
{
	int bonus = 0;
	/* moving to a cold cache gets you a 1500 penalty */
	if (!cpus_intersects(irq->old_mask, cache_domain->mask))
		bonus = CROSS_PACKAGE_PENALTY/2;
	/* do a little numa affinity */
	if (!cpus_intersects(irq->numa_mask, cache_domain->mask))
		bonus += NUMA_PENALTY;
	/* but if the irq has had 0 interrupts for a while move it about more easily */
	if (irq->workload==0)
		bonus = bonus / 10;
	/* pay 6000 for each previous interrupt of the same class */
	bonus += CLASS_VIOLATION_PENTALTY * cache_domain->class_count[irq->class];
	/* if the cache domain has no cpus in the allowed mask.. just block */
	if (!cpus_intersects(irq->allowed_mask, cache_domain->mask))
		bonus += 600000;
	return irq->workload + bonus;
}

static uint64_t cpu_cost_func(struct interrupt *irq, struct cpu_core *cpu)
{
	int bonus = 0;
	/* moving to a colder core gets you a 1000 penalty */
	if (!cpus_intersects(irq->old_mask, cpu->mask))
		bonus = CROSS_PACKAGE_PENALTY/3;
	/* do a little numa affinity */
	if (!cpus_intersects(irq->numa_mask, cpu->mask))
		bonus += NUMA_PENALTY;
	/* but if the irq has had 0 interrupts for a while move it about more easily */
	if (irq->workload==0)
		bonus = bonus / 10;
	/*
	 * since some chipsets only place at the first cpu, give a tiny preference to non-first
	 * cpus for specifically placed interrupts
	 */
	if (first_cpu(cpu->cache_mask)==cpu->number)
		bonus++;
	/* pay 6000 for each previous interrupt of the same class */
	bonus += CLASS_VIOLATION_PENTALTY * cpu->class_count[irq->class];
	/* if the core has no cpus in the allowed mask.. just block */
	if (!cpus_intersects(irq->allowed_mask, cpu->mask))
		bonus += 600000;
	return irq->workload + bonus;
}

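/*
 * For every irq currently assigned to this package, find the cache domain
 * with the lowest projected load (current workload + cost function) and move
 * the irq there, unless its balance level keeps it at package scope.  The
 * same greedy pattern is repeated per cache domain in place_core() below.
 */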
static void place_cache_domain(struct package *package)
{
	GList *iter, *next;
	GList *pkg;
	struct interrupt *irq;
	struct cache_domain *cache_domain;

	iter = g_list_first(package->interrupts);
	while (iter) {
		struct cache_domain *best = NULL;
		uint64_t best_cost = INT_MAX;

		irq = iter->data;
		if (irq->balance_level <= BALANCE_PACKAGE) {
			iter = g_list_next(iter);
			continue;
		}
		pkg = g_list_first(package->cache_domains);
		while (pkg) {
			uint64_t newload;

			cache_domain = pkg->data;
			newload = cache_domain->workload + cache_domain_cost_func(irq, cache_domain);
			if (newload < best_cost) {
				best = cache_domain;
				best_cost = newload;
			}
			pkg = g_list_next(pkg);
		}
		if (best) {
			next = g_list_next(iter);
			package->interrupts = g_list_delete_link(package->interrupts, iter);

			best->workload += irq->workload + 1;
			best->interrupts=g_list_append(best->interrupts, irq);
			best->class_count[irq->class]++;
			irq->mask = best->mask;
			iter = next;
		} else
			iter = g_list_next(iter);
	}
}

static void place_core(struct cache_domain *cache_domain)
{
	GList *iter, *next;
	GList *pkg;
	struct interrupt *irq;
	struct cpu_core *cpu;

	iter = g_list_first(cache_domain->interrupts);
	while (iter) {
		struct cpu_core *best = NULL;
		uint64_t best_cost = INT_MAX;

		irq = iter->data;
		/* if the irq isn't per-core policy and is not very busy, leave it at cache domain level */
		if (irq->balance_level <= BALANCE_CACHE && irq->workload < CORE_SPECIFIC_THRESHOLD && !one_shot_mode) {
			iter = g_list_next(iter);
			continue;
		}
		pkg = g_list_first(cache_domain->cpu_cores);
		while (pkg) {
			uint64_t newload;

			cpu = pkg->data;
			newload = cpu->workload + cpu_cost_func(irq, cpu);
			if (newload < best_cost) {
				best = cpu;
				best_cost = newload;
			}
			pkg = g_list_next(pkg);
		}
		if (best) {
			next = g_list_next(iter);
			cache_domain->interrupts = g_list_delete_link(cache_domain->interrupts, iter);

			best->workload += irq->workload + 1;
			best->interrupts=g_list_append(best->interrupts, irq);
			best->class_count[irq->class]++;
			irq->mask = best->mask;
			iter = next;
		} else
			iter = g_list_next(iter);
	}
}

static void place_packages(GList *list)
{
	GList *iter;
	GList *pkg;
	struct interrupt *irq;
	struct package *package;

	iter = g_list_first(list);
	while (iter) {
		struct package *best = NULL;
		uint64_t best_cost = INT_MAX;

		irq = iter->data;
		if (irq->balance_level == BALANCE_NONE) {
			iter = g_list_next(iter);
			continue;
		}
		pkg = g_list_first(packages);
		while (pkg) {
			uint64_t newload;

			package = pkg->data;
			newload = package->workload + package_cost_func(irq, package);
			if (newload < best_cost) {
				best = package;
				best_cost = newload;
			}
			pkg = g_list_next(pkg);
		}
		if (best) {
			best->workload += irq->workload + 1;
			best->interrupts=g_list_append(best->interrupts, irq);
			best->class_count[irq->class]++;
			irq->mask = best->mask;
		}
		iter = g_list_next(iter);
	}
}

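/*
 * Override the computed mask with the affinity hint (kept in node_mask here)
 * when that hint is non-empty, differs from what was just computed, and does
 * not simply name every possible cpu: an all-f's hint carries no information
 * and is ignored.
 */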
static void place_affinity_hint(GList *list)
{
	/* still need to balance best workload within the affinity_hint mask */
	GList *iter;
	struct interrupt *irq;

	iter = g_list_first(list);
	while (iter) {
		irq = iter->data;
		if (irq->balance_level == BALANCE_NONE) {
			iter = g_list_next(iter);
			continue;
		}
		if ((!cpus_empty(irq->node_mask)) &&
		    (!cpus_equal(irq->mask, irq->node_mask)) &&
		    (!__cpus_full(&irq->node_mask, num_possible_cpus()))) {
			irq->old_mask = irq->mask;
			irq->mask = irq->node_mask;
		}
		iter = g_list_next(iter);
	}
}

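/*
 * IRQs that may not be balanced (BALANCE_NONE) still generate load wherever
 * they are pinned; credit their workload to every package, cache domain and
 * cpu whose mask they can touch so the balancer plans around them.
 */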
static void do_unroutables(void)
{
	struct package *package;
	struct cache_domain *cache_domain;
	struct cpu_core *cpu;
	struct interrupt *irq;
	GList *iter, *inter;

	inter = g_list_first(interrupts);
	while (inter) {
		irq = inter->data;
		inter = g_list_next(inter);
		if (irq->balance_level != BALANCE_NONE)
			continue;

		iter = g_list_first(packages);
		while (iter) {
			package = iter->data;
			if (cpus_intersects(package->mask, irq->node_mask) ||
			    cpus_intersects(package->mask, irq->mask))
				package->workload += irq->workload;
			iter = g_list_next(iter);
		}
		iter = g_list_first(cache_domains);
		while (iter) {
			cache_domain = iter->data;
			if (cpus_intersects(cache_domain->mask, irq->node_mask) ||
			    cpus_intersects(cache_domain->mask, irq->mask))
				cache_domain->workload += irq->workload;
			iter = g_list_next(iter);
		}
		iter = g_list_first(cpus);
		while (iter) {
			cpu = iter->data;
			if (cpus_intersects(cpu->mask, irq->node_mask) ||
			    cpus_intersects(cpu->mask, irq->mask))
				cpu->workload += irq->workload;
			iter = g_list_next(iter);
		}
	}
}

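/*
 * Top-level placement pass: clear the previous round's statistics, account
 * for unroutable irqs first, then place irqs onto packages, refine within
 * each package to a cache domain, refine within each cache domain to a core,
 * and finally apply any affinity hints.
 */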
void calculate_placement(void)
{
	struct package *package;
	struct cache_domain *cache_domain;
	GList *iter;

	/* first clear old data */
	clear_work_stats();
	sort_irq_list();

	do_unroutables();

	place_packages(interrupts);
	iter = g_list_first(packages);
	while (iter) {
		package = iter->data;
		place_cache_domain(package);
		iter = g_list_next(iter);
	}

	iter = g_list_first(cache_domains);
	while (iter) {
		cache_domain = iter->data;
		place_core(cache_domain);
		iter = g_list_next(iter);
	}
	/*
	 * if affinity_hint is populated on irq and is not set to
	 * all CPUs (meaning it's initialized), honor that above
	 * anything in the package locality/workload.
	 */
	place_affinity_hint(interrupts);
}