diff -ruN linux-2.4.20-8/arch/i386/kernel/entry.S procstate-2.4.20-8/arch/i386/kernel/entry.S --- linux-2.4.20-8/arch/i386/kernel/entry.S 2003-03-13 14:24:26.000000000 -0800 +++ procstate-2.4.20-8/arch/i386/kernel/entry.S 2004-05-21 16:17:20.000000000 -0700 @@ -673,6 +673,8 @@ .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_wait */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_remap_file_pages */ .long SYMBOL_NAME(sys_set_tid_address) + .long SYMBOL_NAME(sys_sched_setprocstate) + .long SYMBOL_NAME(sys_sched_getprocstate) /* 260 */ .rept NR_syscalls-(.-sys_call_table)/4 diff -ruN linux-2.4.20-8/arch/i386/kernel/setup.c procstate-2.4.20-8/arch/i386/kernel/setup.c --- linux-2.4.20-8/arch/i386/kernel/setup.c 2003-03-13 14:24:26.000000000 -0800 +++ procstate-2.4.20-8/arch/i386/kernel/setup.c 2004-05-21 10:57:11.000000000 -0700 @@ -2964,6 +2964,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; + static char *procstate[] = { /* names indexed by the value get_procstate() returns */ + "enabled", "restricted", + }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; int fpu_exception; @@ -3027,9 +3030,13 @@ x86_cap_flags[i] != NULL ) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + + seq_printf(m, "procstate\t: %s\n", procstate[get_procstate(n)]); + seq_printf(m, "\n"); + return 0; } diff -ruN linux-2.4.20-8/fs/proc/array.c procstate-2.4.20-8/fs/proc/array.c --- linux-2.4.20-8/fs/proc/array.c 2003-03-13 14:24:29.000000000 -0800 +++ procstate-2.4.20-8/fs/proc/array.c 2004-10-12 17:32:36.000000000 -0700 @@ -273,6 +273,16 @@ cap_t(p->cap_effective)); } +static inline char *task_curcpu(struct task_struct *p, char *buffer) +{ /* appends a CPU: line holding task_cpu(p); returns the new end of the text */ + return buffer + sprintf(buffer, "CPU:\t%d\n", task_cpu(p)); +} + +static inline char *task_affinity(struct task_struct *p, char *buffer) +{ /* appends an Affin: line holding p->cpus_allowed in hex; returns the new end of the text */ + return buffer + sprintf(buffer, "Affin:\t%08lx\n", p->cpus_allowed); 
+} + int proc_pid_status(struct task_struct *task, char * buffer) { @@ -292,6 +302,8 @@ } buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); + buffer = task_curcpu(task, buffer); + buffer = task_affinity(task, buffer); #if defined(CONFIG_ARCH_S390) buffer = task_show_regs(task, buffer); #endif diff -ruN linux-2.4.20-8/include/asm-i386/unistd.h procstate-2.4.20-8/include/asm-i386/unistd.h --- linux-2.4.20-8/include/asm-i386/unistd.h 2003-03-13 14:24:21.000000000 -0800 +++ procstate-2.4.20-8/include/asm-i386/unistd.h 2004-05-21 09:08:39.000000000 -0700 @@ -259,6 +259,8 @@ #define __NR_exit_group 252 #define __NR_lookup_dcookie 253 #define __NR_set_tid_address 258 +#define __NR_sched_setprocstate 259 +#define __NR_sched_getprocstate 260 /* user-visible error numbers are in the range -1 - -124: see */ diff -ruN linux-2.4.20-8/include/linux/sched.h procstate-2.4.20-8/include/linux/sched.h --- linux-2.4.20-8/include/linux/sched.h 2003-03-13 14:32:17.000000000 -0800 +++ procstate-2.4.20-8/include/linux/sched.h 2004-10-24 10:45:00.000000000 -0700 @@ -143,6 +143,15 @@ struct completion; +/* + * states for sched_{get,set}procstate - the numbering of these is used by + * /proc/cpuinfo code. + */ +#define PROC_ENABLED 0 /* CPU accepts any task whose cpus_allowed includes it */ +#define PROC_RESTRICTED 1 /* CPU accepts only tasks whose cpus_allowed is exactly this one CPU */ +#define PROCSTATE_MIN PROC_ENABLED /* lowest defined state value */ +#define PROCSTATE_MAX PROC_RESTRICTED /* highest defined state value */ + #ifdef __KERNEL__ #include @@ -568,8 +577,10 @@ #if CONFIG_SMP extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +extern int get_procstate(int cpu); /* returns the PROC_* state recorded for cpu */ #else #define set_cpus_allowed(p, new_mask) do { } while (0) +#define get_procstate(cpu) (PROC_ENABLED) /* !CONFIG_SMP: always reports PROC_ENABLED */ #endif extern void set_user_nice(task_t *p, long nice); diff -ruN linux-2.4.20-8/include/linux/sys.h procstate-2.4.20-8/include/linux/sys.h --- linux-2.4.20-8/include/linux/sys.h 2003-03-13 14:24:21.000000000 -0800 +++ procstate-2.4.20-8/include/linux/sys.h 2004-05-21 16:23:18.000000000 -0700 @@ -4,7 +4,7 @@ /* * system call entry points ... 
but not all are defined */ -#define NR_syscalls 260 +#define NR_syscalls 262 /* * These are system calls that will be removed at some time diff -ruN linux-2.4.20-8/kernel/sched.c procstate-2.4.20-8/kernel/sched.c --- linux-2.4.20-8/kernel/sched.c 2003-03-13 14:24:20.000000000 -0800 +++ procstate-2.4.20-8/kernel/sched.c 2004-10-24 15:54:00.000000000 -0700 @@ -1,3 +1,10 @@ +/* remove this block and all PRDBG() lines when done */ +#define PROCSTATE_DEBUG 1 +#define PRDBG(fmt, args...) do { \ + if (PROCSTATE_DEBUG) \ + printk(KERN_ERR "DBG: " fmt, ##args); \ +} while (0) + /* * kernel/sched.c * @@ -179,6 +186,12 @@ #endif /* + * Per-CPU processor state: one PROC_* value per CPU + */ +static spinlock_t procstate_lock = SPIN_LOCK_UNLOCKED; +static int procstate[NR_CPUS]; + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. @@ -222,6 +235,83 @@ spin_unlock_irq(&rq->lock); } +static inline int __cpu_allowed_in_mask(int cpu, unsigned long cpus_allowed) +{ + unsigned long mask = 1UL << cpu; + + /* it might not be in the cpus_allowed */ + if (!(cpus_allowed & mask)) + return 0; + + if (likely(procstate[cpu] == PROC_ENABLED)) + return 1; + else if (procstate[cpu] == PROC_RESTRICTED /* a restricted CPU only takes tasks bound to it alone */ + && cpus_allowed == mask) + return 1; + + return 0; +} + +static inline int __cpu_allowed(task_t *task, int cpu) +{ /* task-based wrapper around __cpu_allowed_in_mask(); does no locking itself */ + return __cpu_allowed_in_mask(cpu, task->cpus_allowed); +} + +/* figure out if a task is eligible for a given CPU (takes procstate_lock around the check) */ +static inline int cpu_allowed(task_t *task, int cpu) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = __cpu_allowed(task, cpu); + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} + +/* find the lowest-numbered eligible CPU or return -1; does no locking itself */ +static inline int __find_first_allowed(task_t *task) +{ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (__cpu_allowed(task, i)) + return i; + } + return -1; +} + +/* find an 
eligible CPU or don't change, but always return a valid CPU # */ +static inline int find_first_allowed(task_t *task) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + + i = __find_first_allowed(task); + if (i < 0) { + if (__cpu_allowed(task, task->cpu)) + i = task->cpu; + else + i = __ffs(task->cpus_allowed); /* nothing eligible: fall back to __ffs() of the mask, as the unpatched code did */ + } + + spin_unlock_irqrestore(&procstate_lock, flags); + + return i; +} + +static inline int __runnable_mask(unsigned long allowed) +{ /* nonzero if at least one CPU would currently accept a task with this mask */ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (__cpu_allowed_in_mask(i, allowed)) + return 1; + } + return 0; +} + /* * Adding/removing a task to/from a priority array: */ @@ -418,7 +508,7 @@ */ if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && - (p->cpus_allowed & (1UL << smp_processor_id())))) { + (cpu_allowed(p, smp_processor_id())))) { set_task_cpu(p, smp_processor_id()); task_rq_unlock(rq, &flags); @@ -836,8 +926,7 @@ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ ((jiffies - (p)->last_run > cache_decay_ticks) && \ - !task_running(rq, p) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) + !task_running(rq, p) && (cpu_allowed((p), (this_cpu)))) curr = curr->prev; @@ -978,6 +1067,95 @@ spin_unlock(&rq->lock); } +#if CONFIG_SMP + +static void procstate_changed(int cpu, int oldstate, int newstate) +{ + unsigned long flags; + + if (oldstate != newstate) + PRDBG("changing CPU %d procstate from %d to %d\n", + cpu, oldstate, newstate); + + /* if it is less restricted (or unchanged), we don't care */ + if (newstate <= oldstate) + return; + + /* if it's more restricted, some tasks may be disallowed */ + PRDBG("scanning tasklist for invalids\n"); + + /* + * We have to do current before anything else, so that the + * migration thread has something to wake up. Otherwise we + * can get stuck. 
+ */ + if (!cpu_allowed(current, current->cpu)) { + PRDBG("migrating current task first\n"); + set_cpus_allowed(current, current->cpus_allowed); + } + + while (1) { + task_t *g, *p; + task_t *task = NULL; + + /* + * This weird approach is because we have to drop the + * tasklist lock to call set_cpus_allowed, so each pass rescans from the start. + */ + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + spin_lock(&procstate_lock); + if (!__cpu_allowed(p, p->cpu)) { + if (unlikely(__find_first_allowed(p) < 0)) { + /* unrunnable task: no CPU accepts its mask, so set every mask bit and signal it */ + PRDBG("sending SIGPWR to %d (%16s)\n", + p->pid, p->comm); + p->cpus_allowed = -1; + send_sig(SIGPWR, p, 0); + } else { + /* migrate it: remember the task and leave the scan below */ + PRDBG("need to migrate %d (%16s)\n", + p->pid, p->comm); + task = p; + } + } + spin_unlock(&procstate_lock); + + if (task) { + get_task_struct(task); + goto found_one; + } + } while_each_thread(g, p); + +found_one: + read_unlock_irqrestore(&tasklist_lock, flags); + + if (!task) + break; + + /* + * Once we have a task that is not eligible on its current + * cpu, we let set_cpus_allowed() do its thing. Running + * tasks will be migrated off to another CPU. 
+ * + * set_cpus_allowed() can sleep - no locks allowed + */ + PRDBG("invalid %d (%16s) ->cpu was %d\n", + task->pid, task->comm, task->cpu); + set_cpus_allowed(task, task->cpus_allowed); + PRDBG("invalid %d (%16s) ->cpu is %d\n", + task->pid, task->comm, task->cpu); + + /* drop the reference taken with get_task_struct() during the scan */ + put_task_struct(task); + } + PRDBG("done scanning\n"); + +} +#else +#define procstate_changed(c, o, n) /* !CONFIG_SMP: expands to nothing */ +#endif + void scheduling_functions_start_here(void) { } /* @@ -1552,7 +1730,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned long new_mask; @@ -1565,8 +1743,7 @@ if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - new_mask &= cpu_online_map; - if (!new_mask) + if (!(new_mask & cpu_online_map)) return -EINVAL; read_lock(&tasklist_lock); @@ -1590,6 +1767,10 @@ !capable(CAP_SYS_NICE)) goto out_unlock; + retval = -EINVAL; + if (!__runnable_mask(new_mask)) /* reject masks that no CPU would accept right now; NOTE(review): procstate[] is read here without procstate_lock - confirm that is acceptable */ + goto out_unlock; + retval = 0; set_cpus_allowed(p, new_mask); @@ -1604,7 +1785,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned int real_len; @@ -1624,7 +1805,7 @@ goto out_unlock; retval = 0; - mask = p->cpus_allowed & cpu_online_map; + mask = p->cpus_allowed; /* raw mask; no longer filtered through cpu_online_map */ out_unlock: read_unlock(&tasklist_lock); @@ -1635,6 +1816,65 @@ return real_len; } +asmlinkage long sys_sched_setprocstate(int cpu, int state) +{ /* sets the PROC_* state of one CPU; caller needs CAP_SYS_ADMIN */ + int oldstate; + unsigned long flags; + int ret = 0; + + PRDBG("sys_sched_setprocstate(%d, %d)\n", cpu, state); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if 
(cpu >= smp_num_cpus || cpu < 0 || + ((1UL< PROCSTATE_MAX) + return -EINVAL; /* NOTE(review): the condition above looks truncated in this copy of the patch (text following 1UL< is missing) - restore it from the original before applying */ + + spin_lock_irqsave(&procstate_lock, flags); + + /* must have at least 1 ENABLED cpu; NOTE(review): procstate[cpu] itself is not consulted, so re-restricting an already restricted CPU also gets -EBUSY when exactly one CPU is enabled - confirm intended */ + if (state != PROC_ENABLED) { + int i; + int count = 0; + + for (i = 0; i < smp_num_cpus; i++) + count += (procstate[i] == PROC_ENABLED); + + if (count == 1) { + spin_unlock_irqrestore(&procstate_lock, flags); + ret = -EBUSY; + goto out; + } + } + + oldstate = procstate[cpu]; + procstate[cpu] = state; + + spin_unlock_irqrestore(&procstate_lock, flags); + + procstate_changed(cpu, oldstate, state); + +out: + return ret; +} + +asmlinkage long sys_sched_getprocstate(int cpu, int *state_ptr) +{ /* NOTE(review): presumably reports the state of cpu through state_ptr; most of the body is missing from this copy of the patch - confirm against the original */ + int state; + + if (cpu >= smp_num_cpus || cpu < 0 || + ((1UL<cpus_allowed = new_mask; /* * Can the task run on the task's current CPU? If not then * migrate the thread off to a proper CPU. */ - if (new_mask & (1UL << task_cpu(p))) { + if (cpu_allowed(p, p->cpu)) { task_rq_unlock(rq, &flags); return; } @@ -1979,7 +2218,7 @@ * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, __ffs(p->cpus_allowed)); + set_task_cpu(p, find_first_allowed(p)); task_rq_unlock(rq, &flags); return; } @@ -2042,7 +2281,7 @@ spin_unlock_irqrestore(&rq->lock, flags); p = req->task; - cpu_dest = __ffs(p->cpus_allowed); + cpu_dest = find_first_allowed(p); rq_dest = cpu_rq(cpu_dest); repeat: cpu_src = task_cpu(p); @@ -2097,6 +2336,18 @@ return 0; } +int get_procstate(int cpu) +{ /* returns procstate[cpu], read under procstate_lock; cpu is not range-checked here */ + int ret; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = procstate[cpu]; + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} + #endif @@ -2119,6 +2370,7 @@ spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); + procstate[i] = PROC_ENABLED; /* initial state recorded for CPU i is enabled */ for (j = 0; j < 2; j++) { array = rq->arrays + j;