diff -ruN linux-2.4.20-8/arch/i386/kernel/entry.S fifth-cut-20040706/arch/i386/kernel/entry.S --- linux-2.4.20-8/arch/i386/kernel/entry.S 2003-03-13 14:24:26.000000000 -0800 +++ fifth-cut-20040706/arch/i386/kernel/entry.S 2004-05-21 16:17:20.000000000 -0700 @@ -673,6 +673,8 @@ .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_wait */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_remap_file_pages */ .long SYMBOL_NAME(sys_set_tid_address) + .long SYMBOL_NAME(sys_sched_setprocstate) + .long SYMBOL_NAME(sys_sched_getprocstate) /* 260 */ .rept NR_syscalls-(.-sys_call_table)/4 diff -ruN linux-2.4.20-8/arch/i386/kernel/setup.c fifth-cut-20040706/arch/i386/kernel/setup.c --- linux-2.4.20-8/arch/i386/kernel/setup.c 2003-03-13 14:24:26.000000000 -0800 +++ fifth-cut-20040706/arch/i386/kernel/setup.c 2004-05-21 10:57:11.000000000 -0700 @@ -2964,6 +2964,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; + static char *procstate[] = { + "enabled", "restricted", + }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; int fpu_exception; @@ -3027,9 +3030,13 @@ x86_cap_flags[i] != NULL ) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + + seq_printf(m, "procstate\t: %s\n", procstate[get_procstate(n)]); + seq_printf(m, "\n"); + return 0; } diff -ruN linux-2.4.20-8/fs/proc/array.c fifth-cut-20040706/fs/proc/array.c --- linux-2.4.20-8/fs/proc/array.c 2003-03-13 14:24:29.000000000 -0800 +++ fifth-cut-20040706/fs/proc/array.c 2004-05-21 11:56:06.000000000 -0700 @@ -125,7 +125,8 @@ "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "Z (zombie)", /* 8 */ - "X (dead)" /* 16 */ + "X (dead)", /* 16 */ + "U (unrunnable)" /* 32 */ }; static inline const char * get_task_state(struct task_struct *tsk) @@ -134,7 +135,8 @@ TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_ZOMBIE | - TASK_STOPPED); + 
TASK_STOPPED | + TASK_UNRUNNABLE); const char **p = &task_state_array[0]; while (state) { @@ -273,6 +275,16 @@ cap_t(p->cap_effective)); } +static inline char *task_curcpu(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, "CPU:\t%d\n", task_cpu(p)); +} + +static inline char *task_affinity(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, "Affin:\t%08lx\n", p->cpus_allowed); +} + int proc_pid_status(struct task_struct *task, char * buffer) { @@ -292,6 +304,8 @@ } buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); + buffer = task_curcpu(task, buffer); + buffer = task_affinity(task, buffer); #if defined(CONFIG_ARCH_S390) buffer = task_show_regs(task, buffer); #endif diff -ruN linux-2.4.20-8/include/asm-i386/unistd.h fifth-cut-20040706/include/asm-i386/unistd.h --- linux-2.4.20-8/include/asm-i386/unistd.h 2003-03-13 14:24:21.000000000 -0800 +++ fifth-cut-20040706/include/asm-i386/unistd.h 2004-05-21 09:08:39.000000000 -0700 @@ -259,6 +259,8 @@ #define __NR_exit_group 252 #define __NR_lookup_dcookie 253 #define __NR_set_tid_address 258 +#define __NR_sched_setprocstate 259 +#define __NR_sched_getprocstate 260 /* user-visible error numbers are in the range -1 - -124: see */ diff -ruN linux-2.4.20-8/include/linux/sched.h fifth-cut-20040706/include/linux/sched.h --- linux-2.4.20-8/include/linux/sched.h 2003-03-13 14:32:17.000000000 -0800 +++ fifth-cut-20040706/include/linux/sched.h 2004-05-21 13:18:48.000000000 -0700 @@ -109,6 +109,7 @@ #define TASK_STOPPED 4 #define TASK_ZOMBIE 8 #define TASK_DEAD 16 +#define TASK_UNRUNNABLE 32 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -143,6 +144,15 @@ struct completion; +/* + * states for sched_{get,set}procstate - the numbering of these is used by + * /proc/cpuinfo code. 
+ */ +#define PROC_ENABLED 0 +#define PROC_RESTRICTED 1 +#define PROCSTATE_MIN PROC_ENABLED +#define PROCSTATE_MAX PROC_RESTRICTED + #ifdef __KERNEL__ #include @@ -568,8 +578,10 @@ #if CONFIG_SMP extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +extern int get_procstate(int cpu); #else #define set_cpus_allowed(p, new_mask) do { } while (0) +#define get_procstate(cpu) (PROC_ENABLED) #endif extern void set_user_nice(task_t *p, long nice); diff -ruN linux-2.4.20-8/include/linux/sys.h fifth-cut-20040706/include/linux/sys.h --- linux-2.4.20-8/include/linux/sys.h 2003-03-13 14:24:21.000000000 -0800 +++ fifth-cut-20040706/include/linux/sys.h 2004-05-21 16:23:18.000000000 -0700 @@ -4,7 +4,7 @@ /* * system call entry points ... but not all are defined */ -#define NR_syscalls 260 +#define NR_syscalls 262 /* * These are system calls that will be removed at some time diff -ruN linux-2.4.20-8/kernel/sched.c fifth-cut-20040706/kernel/sched.c --- linux-2.4.20-8/kernel/sched.c 2003-03-13 14:24:20.000000000 -0800 +++ fifth-cut-20040706/kernel/sched.c 2004-06-29 19:28:45.000000000 -0700 @@ -1,3 +1,10 @@ +/* remove this block and all PRDBG() lines when done */ +#define PROCSTATE_DEBUG 1 +#define PRDBG(fmt, args...) do { \ + if (PROCSTATE_DEBUG) \ + printk(KERN_ERR "DBG: " fmt, ##args); \ +} while (0) + /* * kernel/sched.c * @@ -179,6 +186,13 @@ #endif /* + * Per-CPU processor state + */ +static spinlock_t procstate_lock = SPIN_LOCK_UNLOCKED; +static int procstate[NR_CPUS]; +static DECLARE_RWSEM(procstate_sem); + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. 
@@ -222,6 +236,96 @@ spin_unlock_irq(&rq->lock); } +static inline int __cpu_allowed_in_mask(int cpu, unsigned long cpus_allowed) +{ + unsigned long mask = 1UL << cpu; + + /* it may not be in the cpus_allowed */ + if (!(cpus_allowed & mask)) + return 0; + + if (likely(procstate[cpu] == PROC_ENABLED)) + return 1; + else if (procstate[cpu] == PROC_RESTRICTED + && cpus_allowed == mask) + return 1; + + return 0; +} + +static inline int __cpu_allowed(task_t *task, int cpu) +{ + return __cpu_allowed_in_mask(cpu, task->cpus_allowed); +} + +/* figure out if a task is eligible for a given CPU */ +static inline int cpu_allowed(task_t *task, int cpu) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = __cpu_allowed(task, cpu); + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} + +/* find an eligible CPU or return -1 */ +static inline int __find_first_allowed(task_t *task) +{ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (cpu_allowed(task, i)) + return i; + } + return -1; +} + +/* find an eligible CPU or don't change, but always return a valid CPU # */ +static inline int find_first_allowed(task_t *task) +{ + int i = __find_first_allowed(task); + if (i < 0) { + if (cpu_allowed(task, task->cpu)) + i = task->cpu; + else + i = __ffs(task->cpus_allowed); + } + return i; +} + +static inline int __runnable_mask(unsigned long allowed) +{ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (__cpu_allowed_in_mask(i, allowed)) + return 1; + } + return 0; +} + +static spinlock_t unrunnable_lock = SPIN_LOCK_UNLOCKED; /* defined on UP too: __activate_task() takes it unconditionally */ +#if CONFIG_SMP +static LIST_HEAD(unrunnable_tasks); + +/* + * Put a task on the unrunnable list. This is called with the + * task_rq locked and the unrunnable_lock held. + */ +static void __unrunnable_task(task_t *task) +{ + list_add_tail(&task->run_list, &unrunnable_tasks); + task->state = TASK_UNRUNNABLE; + task->array = NULL; +#if 0 /* some other bug does not want printk here! 
*/ + PRDBG("task %d (%16s) seems to be unrunnable\n", task->pid, task->comm); +#endif +} +#else +#define __unrunnable_task(t) +#endif + /* * Adding/removing a task to/from a priority array: */ @@ -278,6 +382,27 @@ */ static inline void __activate_task(task_t *p, runqueue_t *rq) { + unsigned long flags; + + /* + * Check here if a task is unrunnable. Unrunnable tasks can sit + * on waitqueues or wherever, but they can't go onto a runqueue. + * Make sure we don't race with anyone changing the procstate and + * making us a liar. When the task becomes runnable again, we'll + * activate it again. Get the unrunnable_lock before the + * procstate_lock to avoid deadlocks. + */ + spin_lock_irqsave(&unrunnable_lock, flags); + spin_lock(&procstate_lock); + if (unlikely(!__cpu_allowed(p, p->cpu))) { + __unrunnable_task(p); + spin_unlock(&procstate_lock); + spin_unlock_irqrestore(&unrunnable_lock, flags); + return; + } + spin_unlock(&procstate_lock); + spin_unlock_irqrestore(&unrunnable_lock, flags); + enqueue_task(p, rq->active); rq->nr_running++; } @@ -418,7 +543,7 @@ */ if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && - (p->cpus_allowed & (1UL << smp_processor_id())))) { + (cpu_allowed(p, smp_processor_id())))) { set_task_cpu(p, smp_processor_id()); task_rq_unlock(rq, &flags); @@ -435,9 +560,11 @@ } success = 1; } - if (p->state >= TASK_ZOMBIE) - BUG(); - p->state = TASK_RUNNING; + if (p->state != TASK_UNRUNNABLE) { + if (p->state >= TASK_ZOMBIE) + BUG(); + p->state = TASK_RUNNING; + } } task_rq_unlock(rq, &flags); @@ -836,8 +969,7 @@ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ ((jiffies - (p)->last_run > cache_decay_ticks) && \ - !task_running(rq, p) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) + !task_running(rq, p) && (cpu_allowed((p), (this_cpu)))) curr = curr->prev; @@ -978,6 +1112,180 @@ spin_unlock(&rq->lock); } +#if CONFIG_SMP +/* + * Walk through the list of unrunnable tasks and activate any that are + * no longer unrunnable. 
This is called with no locks. + */ +static void check_all_unrunnables(void) +{ + struct list_head *tmp, *tmp2; + LIST_HEAD(now_runnable); + unsigned long flags; + + /* + * Build up a (local) list of now-runnable tasks, then drop the + * unrunnable lock. The act of trying to activate these tasks may + * want to put them on the unrunnable list if something has changed + * underneath us. Also, there is an AB-BA deadlock chance with + * other CPUs (unrunnable_lock vs. rq->lock => always get rq locks + * first). + */ + PRDBG("checking for unrunnables that may be runnable\n"); + spin_lock_irqsave(&unrunnable_lock, flags); + list_for_each_safe(tmp, tmp2, &unrunnable_tasks) { + task_t *task; + int cpu; + + task = list_entry(tmp, task_t, run_list); + //PRDBG("task %d (%16s) is on the unrunnable list\n", + // task->pid, task->comm); + cpu = __find_first_allowed(task); + if (cpu >= 0) { + list_del(&task->run_list); + /* safe, no one else can touch this task */ + task->cpu = cpu; + list_add_tail(&task->run_list, &now_runnable); + } + } + spin_unlock_irqrestore(&unrunnable_lock, flags); + + /* + * Now try to activate the list of tasks we suspect are runnable. 
+ */ + if (!list_empty(&now_runnable)) + PRDBG("activating possible runnables\n"); + list_for_each_safe(tmp, tmp2, &now_runnable) { + task_t *task; + + task = list_entry(tmp, task_t, run_list); + list_del(&task->run_list); + + PRDBG("task %d (%16s) is now runnable on %d\n", + task->pid, task->comm, task->cpu); + try_to_wake_up(task, TASK_UNRUNNABLE, 0); + } + PRDBG("done with unrunnables\n"); +} + +static void check_if_unrunnable(task_t *task) +{ + if (task->state == TASK_UNRUNNABLE) { + int cpu; + + cpu = __find_first_allowed(task); + if (cpu >= 0) { + unsigned long flags; + runqueue_t *rq; + + PRDBG("task %d (%16s) is runnable on %d\n", + task->pid, task->comm, cpu); /* task->cpu still holds the old CPU here */ + spin_lock_irqsave(&unrunnable_lock, flags); + list_del(&task->run_list); + spin_unlock_irqrestore(&unrunnable_lock, flags); + + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + task->cpu = cpu; + task->state = TASK_RUNNING; + __activate_task(task, rq); + spin_unlock_irqrestore(&rq->lock, flags); + } + } +} + +static void move_displaced_tasks(void) +{ + unsigned long flags; + + PRDBG("scanning tasklist for invalids\n"); + + /* + * We have to do current before anything else, so that the + * migration thread has something to wake up. Otherwise we + * get stuck. + */ + if (!cpu_allowed(current, current->cpu)) { + PRDBG("migrating current task first\n"); + set_cpus_allowed(current, current->cpus_allowed); + } + + while (1) { + task_t *g, *p; + task_t *task = NULL; + + /* + * This weird approach is because we have to drop the + * tasklist lock to call set_cpus_allowed. + */ + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + /* + * Look at any task that is not allowed on its + * current CPU. Otherwise we have races where a + * sleeping task is unrunnable at all, p->cpu = x, + * then CPU y comes online, and would make p runnable + * again. When p wakes up, activate_task() can + * not re-assign p->cpu, so p ends up unrunnable. 
+ * + * Tasks which are asleep and have no other eligible + * CPU are left as they are, for now. + */ + if (!cpu_allowed(p, p->cpu) + && (p->array || __find_first_allowed(p) != -1)) { + task = p; + get_task_struct(task); + goto found_one; + } + } while_each_thread(g, p); +found_one: + read_unlock_irqrestore(&tasklist_lock, flags); + if (!task) + break; + + /* + * Once we have a task that is not eligible on its current + * cpu, we let set_cpus_allowed() do its thing. Running + * tasks will be migrated off to another CPU. Sleeping + * tasks will have their ->cpu changed. Unrunnable tasks + * that were running will be caught in __activate_task() + * from the migration thread. Unrunnable tasks that were + * sleeping will be caught by __activate_task() when they + * wake up. + * + * set_cpus_allowed() can sleep - no locks allowed + */ + PRDBG("invalid %d (%16s) ->cpu was %d\n", + task->pid, task->comm, task->cpu); + set_cpus_allowed(task, task->cpus_allowed); + PRDBG("invalid %d (%16s) ->cpu is %d\n", + task->pid, task->comm, task->cpu); + + /* decrement use counter */ + put_task_struct(task); + } + PRDBG("done scanning\n"); +} + +static void procstate_changed(int cpu, int oldstate, int newstate) +{ + PRDBG("changing CPU %d procstate from %d to %d\n", + cpu, oldstate, newstate); + + /* if it's more restricted, some tasks may be disallowed */ + if (newstate > oldstate) + move_displaced_tasks(); + + /* + * Always check_all_unrunnables() - there is a race between any task + * waking up and us walking the task list in move_displaced_tasks(). 
+ */ + check_all_unrunnables(); +} +#else +#define procstate_changed(c, o, n) +#endif + void scheduling_functions_start_here(void) { } /* @@ -1552,7 +1860,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned long new_mask; @@ -1565,8 +1873,7 @@ if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - new_mask &= cpu_online_map; - if (!new_mask) + if (!(new_mask & cpu_online_map)) return -EINVAL; read_lock(&tasklist_lock); @@ -1590,8 +1897,14 @@ !capable(CAP_SYS_NICE)) goto out_unlock; + retval = -EINVAL; + if (!__runnable_mask(new_mask)) + goto out_unlock; + retval = 0; + down_read(&procstate_sem); set_cpus_allowed(p, new_mask); + up_read(&procstate_sem); out_unlock: put_task_struct(p); @@ -1604,7 +1917,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned int real_len; @@ -1624,7 +1937,7 @@ goto out_unlock; retval = 0; - mask = p->cpus_allowed & cpu_online_map; + mask = p->cpus_allowed; out_unlock: read_unlock(&tasklist_lock); @@ -1635,6 +1948,69 @@ return real_len; } +asmlinkage long sys_sched_setprocstate(int cpu, int state) +{ + int oldstate; + unsigned long flags; + int ret = 0; + + PRDBG("sys_sched_setprocstate(%d, %d)\n", cpu, state); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (cpu >= smp_num_cpus || cpu < 0 || + ((1UL< PROCSTATE_MAX) + return -EINVAL; + + /* only one process monkeying in here, please */ + down_write(&procstate_sem); + + spin_lock_irqsave(&procstate_lock, flags); + + /* must have at least 1 
ENABLED cpu */ + if (state != PROC_ENABLED) { + int i; + int count = 0; + + for (i = 0; i < smp_num_cpus; i++) + count += (procstate[i] == PROC_ENABLED); + + if (count == 1 && procstate[cpu] == PROC_ENABLED) { /* refuse only if @cpu is itself the last ENABLED one */ + spin_unlock_irqrestore(&procstate_lock, flags); + ret = -EBUSY; + goto out; + } + } + + oldstate = procstate[cpu]; + procstate[cpu] = state; + + spin_unlock_irqrestore(&procstate_lock, flags); + + if (oldstate != state) + procstate_changed(cpu, oldstate, state); +out: + up_write(&procstate_sem); + return ret; +} + +asmlinkage long sys_sched_getprocstate(int cpu, int *state_ptr) +{ + int state; + + if (cpu >= smp_num_cpus || cpu < 0 || + ((1UL<cpus_allowed = new_mask; /* * Can the task run on the task's current CPU? If not then * migrate the thread off to a proper CPU. */ - if (new_mask & (1UL << task_cpu(p))) { + if (cpu_allowed(p, p->cpu)) { task_rq_unlock(rq, &flags); - return; + goto out; } /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, __ffs(p->cpus_allowed)); + set_task_cpu(p, find_first_allowed(p)); task_rq_unlock(rq, &flags); - return; + goto out; } init_completion(&req.done); req.task = p; @@ -1991,6 +2367,8 @@ wake_up_process(rq->migration_thread); wait_for_completion(&req.done); +out: + check_if_unrunnable(p); } /* @@ -2042,7 +2420,7 @@ spin_unlock_irqrestore(&rq->lock, flags); p = req->task; - cpu_dest = __ffs(p->cpus_allowed); + cpu_dest = find_first_allowed(p); rq_dest = cpu_rq(cpu_dest); repeat: cpu_src = task_cpu(p); @@ -2097,6 +2477,18 @@ return 0; } +int get_procstate(int cpu) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = procstate[cpu]; + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} + #endif @@ -2119,6 +2511,7 @@ spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); + procstate[i] = PROC_ENABLED; for (j = 0; j < 2; j++) { array = 
rq->arrays + j;