Index: procstate-kernel-2.4/arch/i386/kernel/entry.S diff -u procstate-kernel-2.4/arch/i386/kernel/entry.S:1.1.1.1 procstate-kernel-2.4/arch/i386/kernel/entry.S:1.2 --- procstate-kernel-2.4/arch/i386/kernel/entry.S:1.1.1.1 Tue Jul 29 17:05:00 2003 +++ procstate-kernel-2.4/arch/i386/kernel/entry.S Tue Jul 29 17:17:55 2003 @@ -649,6 +649,8 @@ .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ .long SYMBOL_NAME(sys_sched_setaffinity) .long SYMBOL_NAME(sys_sched_getaffinity) + .long SYMBOL_NAME(sys_sched_setprocstate) + .long SYMBOL_NAME(sys_sched_getprocstate) .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) Index: procstate-kernel-2.4/arch/i386/kernel/setup.c diff -u procstate-kernel-2.4/arch/i386/kernel/setup.c:1.1.1.1 procstate-kernel-2.4/arch/i386/kernel/setup.c:1.2 --- procstate-kernel-2.4/arch/i386/kernel/setup.c:1.1.1.1 Tue Jul 29 17:05:00 2003 +++ procstate-kernel-2.4/arch/i386/kernel/setup.c Tue Jul 29 17:17:55 2003 @@ -2884,6 +2884,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; + static char *procstate[] = { + "enabled", "restricted", "isolated" + }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; int fpu_exception; @@ -2947,9 +2950,13 @@ x86_cap_flags[i] != NULL ) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + + seq_printf(m, "procstate\t: %s\n", procstate[get_procstate(n)]); + seq_printf(m, "\n"); + return 0; } Index: procstate-kernel-2.4/fs/proc/array.c diff -u procstate-kernel-2.4/fs/proc/array.c:1.1.1.1 procstate-kernel-2.4/fs/proc/array.c:1.4 --- procstate-kernel-2.4/fs/proc/array.c:1.1.1.1 Tue Jul 29 17:06:45 2003 +++ procstate-kernel-2.4/fs/proc/array.c Wed Sep 17 16:12:57 2003 @@ -125,7 +125,8 @@ "D (disk sleep)", /* 2 */ "Z (zombie)", /* 4 */ "T (stopped)", /* 8 */ - "W (paging)" /* 16 */ + "W 
(paging)", /* 16 */ + "U (unrunnable)" /* 32 */ }; static inline const char * get_task_state(struct task_struct *tsk) @@ -134,7 +135,8 @@ TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_ZOMBIE | - TASK_STOPPED); + TASK_STOPPED | + TASK_UNRUNNABLE); const char **p = &task_state_array[0]; while (state) { @@ -272,6 +274,17 @@ cap_t(p->cap_effective)); } +static inline char *task_cpu(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, "CPU:\t%d\n", p->cpu); +} + +static inline char *task_affinity(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, "Affin:\t%08lx\n" + "AffMsk:\t%08lx\n", + p->cpus_allowed, p->cpus_allowed_mask); +} int proc_pid_status(struct task_struct *task, char * buffer) { @@ -291,6 +304,8 @@ } buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); + buffer = task_cpu(task, buffer); + buffer = task_affinity(task, buffer); #if defined(CONFIG_ARCH_S390) buffer = task_show_regs(task, buffer); #endif Index: procstate-kernel-2.4/include/asm-i386/unistd.h diff -u procstate-kernel-2.4/include/asm-i386/unistd.h:1.1.1.1 procstate-kernel-2.4/include/asm-i386/unistd.h:1.2 --- procstate-kernel-2.4/include/asm-i386/unistd.h:1.1.1.1 Tue Jul 29 17:06:50 2003 +++ procstate-kernel-2.4/include/asm-i386/unistd.h Tue Jul 29 17:18:06 2003 @@ -247,6 +247,8 @@ #define __NR_futex 240 #define __NR_sched_setaffinity 241 #define __NR_sched_getaffinity 242 +#define __NR_sched_setprocstate 243 +#define __NR_sched_getprocstate 244 /* user-visible error numbers are in the range -1 - -124: see */ Index: procstate-kernel-2.4/include/linux/sched.h diff -u procstate-kernel-2.4/include/linux/sched.h:1.1.1.1 procstate-kernel-2.4/include/linux/sched.h:1.4 --- procstate-kernel-2.4/include/linux/sched.h:1.1.1.1 Tue Jul 29 17:06:57 2003 +++ procstate-kernel-2.4/include/linux/sched.h Wed Sep 17 16:13:04 2003 @@ -92,6 +92,8 @@ #define TASK_UNINTERRUPTIBLE 2 #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 +/* /proc printing code thinks 
something is at 16, better to just skip it */ +#define TASK_UNRUNNABLE 32 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -126,6 +128,14 @@ struct completion; +/* + * states for sched_{get,set}procstate - the numbering of these is + * used by /proc/cpuinfo + */ +#define PROC_ENABLED 0 +#define PROC_RESTRICTED 1 +#define PROC_ISOLATED 2 + #ifdef __KERNEL__ #include @@ -496,6 +506,7 @@ #else #define set_cpus_allowed(p, new_mask) do { } while (0) #endif +extern int get_procstate(int cpu); extern void set_user_nice(task_t *p, long nice); extern int task_prio(task_t *p); Index: procstate-kernel-2.4/kernel/sched.c diff -u procstate-kernel-2.4/kernel/sched.c:1.1.1.1 procstate-kernel-2.4/kernel/sched.c:1.4 --- procstate-kernel-2.4/kernel/sched.c:1.1.1.1 Tue Jul 29 17:07:05 2003 +++ procstate-kernel-2.4/kernel/sched.c Wed Sep 17 16:13:07 2003 @@ -1,3 +1,10 @@ +/* remove this block and all PRDBG() lines when done */ +#define PROCSTATE_DEBUG 1 +#define PRDBG(fmt, args...) do { \ + if (PROCSTATE_DEBUG) \ + printk(KERN_ERR "DBG: " fmt, ##args); \ +} while (0) + /* * kernel/sched.c * @@ -163,6 +170,12 @@ #endif /* + * Per-CPU processor state + */ +static spinlock_t procstate_lock = SPIN_LOCK_UNLOCKED; +static int procstate[NR_CPUS]; + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. 
@@ -207,7 +220,99 @@ local_irq_enable(); } +static inline int __cpu_allowed_in_masks(int cpu, unsigned long cpus_allowed, + unsigned long cpus_allowed_mask) +{ + unsigned long mask = 1UL << cpu; + + /* it may not be in the cpus_allowed */ + if (!(cpus_allowed & mask)) + return 0; + + if (likely(procstate[cpu] == PROC_ENABLED)) + return 1; + else if (procstate[cpu] == PROC_RESTRICTED + && cpus_allowed == mask) + return 1; + else if (procstate[cpu] == PROC_ISOLATED + && cpus_allowed == mask && cpus_allowed_mask == mask) + return 1; + + return 0; +} + +static inline int __cpu_allowed(task_t *task, int cpu) +{ + return __cpu_allowed_in_masks(cpu, task->cpus_allowed, + task->cpus_allowed_mask); +} + +/* figure out if a task is eligible for a given CPU */ +static inline int cpu_allowed(task_t *task, int cpu) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = __cpu_allowed(task, cpu); + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} + +/* find an eligible CPU or return -1 */ +static inline int __find_first_allowed(task_t *task) +{ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (cpu_allowed(task, i)) + return i; + } + return -1; +} + +/* find an eligible CPU or don't change, but always return a valid CPU # */ +static inline int find_first_allowed(task_t *task) +{ + int i = __find_first_allowed(task); + if (i < 0) { + if (cpu_allowed(task, task->cpu)) + i = task->cpu; + else + i = __ffs(task->cpus_allowed); + } + return i; +} + +static inline int __runnable_masks(unsigned long allowed, + unsigned long allowed_mask) +{ + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (__cpu_allowed_in_masks(i, allowed, allowed_mask)) + return 1; + } + return 0; +} + +#if CONFIG_SMP +static spinlock_t unrunnable_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(unrunnable_tasks); + /* + * Put a task on the unrunnable list. This is called with the + * task_rq locked and the unrunnable_lock held. 
+ */ +static void __unrunnable_task(task_t *task) +{ + list_add_tail(&task->run_list, &unrunnable_tasks); + task->state = TASK_UNRUNNABLE; +} +#else +#define __unrunnable_task(t); +#endif + +/* * Adding/removing a task to/from a priority array: */ static inline void dequeue_task(struct task_struct *p, prio_array_t *array) @@ -256,7 +361,27 @@ { unsigned long sleep_time = jiffies - p->sleep_timestamp; prio_array_t *array = rq->active; + unsigned long flags; + /* + * Check here if a task is unrunnable. Unrunnable tasks can sit + * on waitqueues or wherever, but they can't go onto a runqueue. + * Make sure we don't race with anyone changing the procstate and + * making us a liar. When the task becomes runnable again, we'll + * activate it again. Get the unrunnable_lock before the + * procstate_lock to avoid deadlocks. + */ + spin_lock_irqsave(&unrunnable_lock, flags); + spin_lock(&procstate_lock); + if (unlikely(!__cpu_allowed(p, p->cpu))) { + __unrunnable_task(p); + spin_unlock(&procstate_lock); + spin_unlock_irqrestore(&unrunnable_lock, flags); + return; + } + spin_unlock(&procstate_lock); + spin_unlock_irqrestore(&unrunnable_lock, flags); + if (!rt_task(p) && sleep_time) { /* * This code gives a bonus to interactive tasks. 
We update @@ -364,7 +489,7 @@ */ if (unlikely(sync && (rq->curr != p) && (p->cpu != smp_processor_id()) && - (p->cpus_allowed & (1UL << smp_processor_id())))) { + (cpu_allowed(p, smp_processor_id())))) { p->cpu = smp_processor_id(); task_rq_unlock(rq, &flags); @@ -380,7 +505,8 @@ resched_task(rq->curr); success = 1; } - p->state = TASK_RUNNING; + if (p->state != TASK_UNRUNNABLE) + p->state = TASK_RUNNING; task_rq_unlock(rq, &flags); return success; @@ -644,8 +770,7 @@ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ - ((p) != (rq)->curr) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) + ((p) != (rq)->curr) && (cpu_allowed((p), (this_cpu)))) curr = curr->prev; @@ -792,6 +917,166 @@ spin_unlock(&rq->lock); } +#if CONFIG_SMP +/* + * Walk through the list of unrunnable tasks and activate any that are + * no longer unrunnable. This is called with no locks. + */ +static void check_all_unrunnables(void) +{ + struct list_head *tmp, *tmp2; + LIST_HEAD(now_runnable); + unsigned long flags; + + /* + * Build up a (local) list of now-runnable tasks, then drop the + * unrunnable lock. The act of trying to activate these tasks may + * want to put them on the unrunnable list if something has changed + * underneath us. Also, there is an AB-BA deadlock chance with + * other CPUs (unrunnable_lock vs. rq->lock => always get rq locks + * first). + */ + PRDBG("checking for unrunnables that may be runnable\n"); + spin_lock_irqsave(&unrunnable_lock, flags); + list_for_each_safe(tmp, tmp2, &unrunnable_tasks) { + task_t *task; + int cpu; + + task = list_entry(tmp, task_t, run_list); + cpu = __find_first_allowed(task); + if (cpu >= 0) { + list_del(&task->run_list); + /* safe, no one else can touch this task */ + task->cpu = cpu; + list_add_tail(&task->run_list, &now_runnable); + } + } + spin_unlock_irqrestore(&unrunnable_lock, flags); + + /* + * Now try to activate the list of tasks we suspect are runnable. 
+ */ + PRDBG("activating possible runnables\n"); + list_for_each_safe(tmp, tmp2, &now_runnable) { + task_t *task; + runqueue_t *rq; + + task = list_entry(tmp, task_t, run_list); + list_del(&task->run_list); + + PRDBG("task %d (%16s) is now runnable on %d\n", + task->pid, task->comm, task->cpu); + rq = task_rq(task); + spin_lock_irqsave(&rq->lock, flags); + task->state = TASK_RUNNING; + activate_task(task, rq); + spin_unlock_irqrestore(&rq->lock, flags); + } + PRDBG("done with unrunnables\n"); +} + +static void check_if_unrunnable(task_t *task) +{ + if (task->state == TASK_UNRUNNABLE) { + int cpu; + + cpu = __find_first_allowed(task); + if (cpu >= 0) { + unsigned long flags; + runqueue_t *rq; + + PRDBG("task %d (%16s) is runnable on %d\n", + task->pid, task->comm, task->cpu); + spin_lock_irqsave(&unrunnable_lock, flags); + list_del(&task->run_list); + spin_unlock_irqrestore(&unrunnable_lock, flags); + + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + task->cpu = cpu; + task->state = TASK_RUNNING; + activate_task(task, rq); + spin_unlock_irqrestore(&rq->lock, flags); + } + } +} + +static void move_displaced_tasks(void) +{ + PRDBG("scanning tasklist for invalids\n"); + while (1) { + task_t *p; + task_t *task = NULL;; + + /* + * This weird approach is because we have to drop the + * tasklist lock to call set_cpus_allowed. + */ + read_lock_irq(&tasklist_lock); + for_each_task(p) { + /* + * Look at any task that is not allowed on it's + * current CPU. Otherwise we have races where a + * sleeping task is unrunnable at all, p->cpu = x, + * then CPU y comes online, and would make p runnable + * again. When p wakes up, activate_task() can + * not re-assign p->cpu, so p ends up unrunnable. + * + * Tasks which are asleep and have no other eligible + * CPU are left as is here. 
+ */ + if (!cpu_allowed(p, p->cpu) + && (p->array || __find_first_allowed(p) != -1)) { + task = p; + get_task_struct(task); + break; + } + } + read_unlock_irq(&tasklist_lock); + if (!task) + break; + + /* + * Once we have a task that is not eligible on its current + * cpu, we let set_cpus_allowed() do its thing. Running + * tasks will be migrated off to another CPU. Sleeping + * tasks will have their ->cpu changed. Unrunnable tasks + * that were running will be caught in activate_task() from + * set_cpus_allowed(). Unrunnable tasks that were sleeping + * will be caught when they wake up by activate_task(). + * + * set_cpus_allowed() can sleep - no locks allowed + */ + PRDBG("invalid %d (%16s) ->cpu was %d\n", + task->pid, task->comm, task->cpu); + set_cpus_allowed(task, task->cpus_allowed); + PRDBG("invalid %d (%16s) ->cpu is %d\n", + task->pid, task->comm, task->cpu); + + /* decrement use counter */ + free_task_struct(task); + } + PRDBG("done scanning\n"); +} + +static void procstate_changed(int cpu, int oldstate, int newstate) +{ + PRDBG("changing CPU %d from %d to %d\n", + cpu, oldstate, newstate); + if (newstate > oldstate) { + /* more restricted - some tasks may be disallowed */ + move_displaced_tasks(); + } + /* + * Always check_all_unrunnables() - there is a race between any task + * waking up and us walking the task list in move_displaced_tasks(). 
+ */ + check_all_unrunnables(); +} +#else +#define procstate_changed(c, o, n) +#endif + void scheduling_functions_start_here(void) { } /* @@ -1277,7 +1562,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned long new_mask; @@ -1290,8 +1575,7 @@ if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - new_mask &= cpu_online_map; - if (!new_mask) + if (!(new_mask & cpu_online_map)) return -EINVAL; read_lock(&tasklist_lock); @@ -1310,13 +1594,12 @@ get_task_struct(p); read_unlock(&tasklist_lock); - if (!capable(CAP_SYS_NICE)) + if (!capable(CAP_SYS_NICE)) { new_mask &= p->cpus_allowed_mask; - if (capable(CAP_SYS_NICE)) - p->cpus_allowed_mask |= new_mask; - if (!new_mask) { - retval = -EINVAL; - goto out_unlock; + if (!new_mask) { + retval = -EINVAL; + goto out_unlock; + } } retval = -EPERM; @@ -1324,6 +1607,14 @@ !capable(CAP_SYS_NICE)) goto out_unlock; + retval = -EINVAL; + if (!__runnable_masks(new_mask, + capable(CAP_SYS_NICE) ? 
new_mask : p->cpus_allowed_mask)) + goto out_unlock; + + if (capable(CAP_SYS_NICE)) + p->cpus_allowed_mask = new_mask; + retval = 0; set_cpus_allowed(p, new_mask); @@ -1338,7 +1629,7 @@ * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned int real_len; @@ -1358,7 +1649,7 @@ goto out_unlock; retval = 0; - mask = p->cpus_allowed & cpu_online_map; + mask = p->cpus_allowed; out_unlock: read_unlock(&tasklist_lock); @@ -1369,6 +1660,58 @@ return real_len; } +asmlinkage long sys_sched_setprocstate(int cpu, int state) +{ + int oldstate; + unsigned long flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (cpu >= smp_num_cpus || cpu < 0 || + ((1UL< PROC_ISOLATED) + return -EINVAL; + + spin_lock_irqsave(&procstate_lock, flags); + + /* must have at least 1 ENABLED cpu */ + if (state != PROC_ENABLED) { + int i; + int count = 0; + + for (i = 0; i < smp_num_cpus; i++) + count += (procstate[i] == PROC_ENABLED); + + if (count == 1) { + spin_unlock_irqrestore(&procstate_lock, flags); + return -EBUSY; + } + } + + oldstate = procstate[cpu]; + procstate[cpu] = state; + + spin_unlock_irqrestore(&procstate_lock, flags); + + if (oldstate != state) + procstate_changed(cpu, oldstate, state); + + return 0; +} + +asmlinkage long sys_sched_getprocstate(int cpu, int *state_ptr) +{ + int state; + + if (cpu >= smp_num_cpus || cpu < 0 || + ((1UL<expired = rq->arrays + 1; spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); + procstate[i] = PROC_ENABLED; for (j = 0; j < 2; j++) { array = rq->arrays + j; @@ -1696,17 +2040,13 @@ migration_req_t req; runqueue_t *rq; - new_mask &= cpu_online_map; - if (!new_mask) - BUG(); - rq = task_rq_lock(p, &flags); p->cpus_allowed = new_mask; /* * Can the task run on the task's current 
CPU? If not then * migrate the process off to a proper CPU. */ - if (new_mask & (1UL << p->cpu)) { + if (cpu_allowed(p, p->cpu)) { task_rq_unlock(rq, &flags); goto out; } @@ -1715,7 +2055,7 @@ * it is sufficient to simply update the task's cpu field. */ if (!p->array && (p != rq->curr)) { - p->cpu = __ffs(p->cpus_allowed); + p->cpu = find_first_allowed(p); task_rq_unlock(rq, &flags); goto out; } @@ -1727,6 +2067,7 @@ down(&req.sem); out: + check_if_unrunnable(p); } static int migration_thread(void * bind_cpu) @@ -1746,6 +2087,7 @@ if (cpu != 0) { while (!cpu_rq(cpu_logical_map(0))->migration_thread) yield(); + current->cpus_allowed_mask = 1UL << cpu; set_cpus_allowed(current, 1UL << cpu); } printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id()); @@ -1777,7 +2119,7 @@ spin_unlock_irqrestore(&rq->lock, flags); p = req->task; - cpu_dest = __ffs(p->cpus_allowed); + cpu_dest = find_first_allowed(p); rq_dest = cpu_rq(cpu_dest); repeat: cpu_src = p->cpu; @@ -1819,7 +2161,20 @@ while (!cpu_rq(cpu_logical_map(cpu))->migration_thread) schedule_timeout(2); } + #endif + +int get_procstate(int cpu) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&procstate_lock, flags); + ret = procstate[cpu]; + spin_unlock_irqrestore(&procstate_lock, flags); + + return ret; +} #if LOWLATENCY_NEEDED #if LOWLATENCY_DEBUG Index: procstate-kernel-2.4/kernel/softirq.c diff -u procstate-kernel-2.4/kernel/softirq.c:1.1.1.1 procstate-kernel-2.4/kernel/softirq.c:1.2 --- procstate-kernel-2.4/kernel/softirq.c:1.1.1.1 Tue Jul 29 17:07:05 2003 +++ procstate-kernel-2.4/kernel/softirq.c Wed Sep 17 16:13:07 2003 @@ -369,6 +369,7 @@ sigfillset(&current->blocked); /* Migrate to the right CPU */ + current->cpus_allowed_mask = 1UL << cpu; set_cpus_allowed(current, 1UL << cpu); if (cpu() != cpu) BUG();