--- /Users/josh/Desktop/cs_capstone/reference/linux-2.6.8.1-unpatched/kernel/sched.c Sat Aug 14 05:55:59 2004 +++ /Users/josh/Desktop/cs_capstone/josh_capstone_work/sched_commented_2.6.8.1.c Sun Jan 2 03:24:40 2005 @@ -18,6 +18,24 @@ * 2004-04-02 Scheduler domains code by Nick Piggin */ +/* + * Additional comments by Josh Aas. + * Copyright (c)2004 Silicon Graphics, Inc. (SGI) + * + * Comments are situated above what they describe. + * + * Abbreviations: + * RT - real-time (as in a "real-time process") + * UP - uniprocessor + * + * Notes: + * - SMT means simultaneous multithreading. This is not the same thing as + * SMP. An example of an SMT system is an Intel Pentium 4 Hyper-Threading (HT) + * enabled processor. Basically, a single SMT chip can run multiple threads, + * which has some interesting scheduler implications since the threads + * share certain physical CPU resources. + */ + #include #include #include @@ -44,6 +62,18 @@ #include +/* + * NUMA architectures have groups of CPUs (and memory) organized + * into nodes. These macros are for getting the CPU mask for + * a node that a CPU belongs to. + * + * If the kernel is compiled for a NUMA architecture, do a node lookup + * by getting a CPU's node and then getting the CPU mask/map for + * that node. If non-NUMA, there will only be one mask/map, so insert that. + * + * Note that these NUMA macros are not used. They should probably have been + * removed from this file. + */ #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else @@ -54,6 +84,25 @@ * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. + * + * PRIO values are the priority values that the Linux scheduler uses internally. + * Possible PRIO values for RT tasks are 0 through (MAX_RT_PRIO - 1), and possible PRIO + * values for non-RT tasks are MAX_RT_PRIO through (MAX_PRIO - 1). The lower a task's + * PRIO value, the higher its priority. 
With this setup, RT tasks will always have + * a higher priority than non-RT tasks. + * + * For non-RT tasks, in order to convert a user-nice value to a PRIO value, one would + * start with MAX_RT_PRIO, add the user-nice value, and then add 20 to make up for the + * fact that the highest possible priority user-nice value is -20. Converting from a + * PRIO value to a user-nice value is just the opposite. This is what the + * NICE_TO_PRIO(nice) and PRIO_TO_NICE(prio) macros do. + * + * TASK_NICE(p) simply gets the user-nice value for a given task. Each task has a + * static and a dynamic priority value. The static priority value is set by users + * via the nice() system call and ranges from -20 to 19. It is stored as a PRIO. The + * dynamic priority is based on a task's static priority, but it is modified based + * on interactivity. The dynamic priority is not relevant here, but is mentioned in + * order to explain why TASK_NICE(p) is determined by a task's static_prio field. */ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) @@ -63,6 +112,23 @@ * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, * it's a [ 0 ... 39 ] range. + * + * USER_PRIO(p) takes an internal non-RT priority and returns its + * priority in terms of 0-39. It is only used by the other macros + * in this group as values of 0-39 don't mean anything in terms of + * internal PRIO values or user-nice values. It is simply a shortcut. + * + * TASK_USER_PRIO is not used by anything, and should be removed from + * the kernel. It is a useless calculation for the reason described above. + * All it does is return a task's USER_PRIO. + * + * MAX_USER_PRIO returns the total number of different priority levels + * non-RT processes can have. In this case, it resolves to 40 (100-139). 
+ * + * AVG_TIMESLICE basically resolves to the half-way point between MIN_TIMESLICE + * and MAX_TIMESLICE. The reason it isn't written simply like that is so the + * algorithm can withstand changes to the priority system. It resolves to about + * 100ms. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) @@ -72,6 +138,13 @@ /* * Some helpers for converting nanosecond timing to jiffy resolution + * + * A nanosecond (NS) is one-billionth of a second. A jiffy is a period of time + * calculated by 1/HZ, where HZ is the architecture-defined number of ticks + * per second. So, to convert from nanoseconds to jiffies, one divides a billion + * by HZ (which results in the number of nanoseconds in a jiffy), and divides + * the number of nanoseconds by that. Jiffies to NS is the same, but multiply + * the number of jiffies by the number of nanoseconds in a jiffy. */ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) @@ -79,9 +152,46 @@ /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. + * MIN_TIMESLICE is the minimum timeslice that a task can be given. It resolves to about 10ms. + * + * MAX_TIMESLICE is the maximum timeslice that a task can be given. It resolves to about 200ms. + * + * ON_RUNQUEUE_WEIGHT ... + * + * CHILD_PENALTY is the penalty that the sleep_avg of forked child tasks gets + * in order to prevent very interactive tasks from spawning other very interactive + * tasks. + * + * PARENT_PENALTY is the penalty that the sleep_avg of parents who forked tasks + * gets in order to prevent very interactive tasks from spawning other very interactive + * tasks. + * + * EXIT_WEIGHT ... + * + * PRIO_BONUS_RATIO is the ratio used to determine MAX_BONUS. + * + * MAX_BONUS ... 
MAX_USER_PRIO resolves to 40, and PRIO_BONUS_RATIO is 25. + * So essentially this means that the max bonus that can be + * given to a task is 25% of the total non-RT priority + * range. Since there are 40 possible non-RT priorities, this + * resolves to 10. + * + * INTERACTIVE_DELTA is the static component used to determine whether or not a task + * should be considered interactive. The higher this is, the more difficult it is for + * tasks to be considered interactive. See the DELTA and TASK_INTERACTIVE macros for + * more information. + * + * MAX_SLEEP_AVG is the number of jiffies that is the maximum average sleep time for + * a task. The higher a task's sleep_avg, the more interactive it is, so this essentially + * puts a limit on how interactive a task can be. + * + * STARVATION_LIMIT is the time limit for which a runnable task may be deprived of + * CPU time before it is considered to be starving. + * + * NS_MAX_SLEEP_AVG is the same as MAX_SLEEP_AVG, but in nanoseconds. + * + * CREDIT_LIMIT is used to determine whether or not a task has high or low interactivity + * credit. See the macros HIGH_CREDIT and LOW_CREDIT. */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) @@ -101,7 +211,9 @@ * If a task is 'interactive' then we reinsert it in the active * array after it has expired its current timeslice. (it will not * continue to run immediately, it will still roundrobin with - * other interactive tasks.) + * other interactive tasks.) This behavior does not prevent the expired + * and unexpired queues from ever being swapped - they will get swapped + * as soon as something in the expired queue is going to starve. * * This part scales the interactivity limit depending on niceness. * @@ -116,7 +228,9 @@ * * (the X axis represents the possible -5 ... 0 ... +5 dynamic * priority range a task can explore, a value of '1' means the - * task is rated interactive.) + * task is rated interactive. So - there are 11 columns. 
The middle + * column is whether or not a task with a certain user-nice level + * is considered interactive if given no + or - bonus at all.) * * Ie. nice +19 tasks can never get 'interactive' enough to be * reinserted into the active array. And only heavily CPU-hog nice -20 @@ -125,10 +239,26 @@ * too hard. */ +/* + * The process's current bonus is its sleep average in jiffies times MAX_BONUS + * divided by MAX_SLEEP_AVG. Essentially it scales a process's sleep average into + * the range 0 to MAX_BONUS. + */ #define CURRENT_BONUS(p) \ (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG) +/* + * If an interactive task has too long a timeslice, it may + * be preempted by a task of equal priority. The task + * does not lose its timeslice, it is just put on the bottom of the + * list of tasks of its priority waiting to run. If there + * was a task of higher priority, it would have already preempted + * a given task. TIMESLICE_GRANULARITY is the time limit for + * what is considered "too long" a timeslice. It is called granularity + * because the timeslice is effectively broken up if it is longer than + * TIMESLICE_GRANULARITY. + */ #ifdef CONFIG_SMP #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ @@ -138,12 +268,38 @@ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) #endif +/* + * This macro is used in the TASK_INTERACTIVE macro to decide if a + * task should be considered interactive. SCALE calculates how much + * higher in priority a task must be from its nice value, minus the + * INTERACTIVE_DELTA, in order to be considered interactive. The idea + * is that tasks with a higher priority nice value should not need to + * be given as much of a bonus in order to be considered interactive + * as tasks given a lower priority nice value. So, a task with a -10 + * nice value will be more easily considered interactive than a task + * with a +10 nice value. 
Since INTERACTIVE_DELTA is static, SCALE + * provides a value to add to it in order to do the interactivity scaling. + */ #define SCALE(v1,v1_max,v2_max) \ (v1) * (v2_max) / (v1_max) +/* + * A task must be DELTA higher in priority than its nice + * value in order to be considered interactive. This value + * is the combination of the scaled factor and the constant + * INTERACTIVE_DELTA factor. + */ #define DELTA(p) \ (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +/* + * This macro returns whether or not a task should be considered + * interactive. If a task's priority value (lower values are higher + * priority) is less than or equal to its static_prio (i.e. nice value) + * minus DELTA, then it is interactive. This is because tasks are given + * priority-raising bonuses (prio lowering) based on heuristics + * that measure characteristics of interactivity. + */ #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) @@ -157,6 +313,10 @@ #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) +/* + * just tells whether or not the task p should preempt + * the task currently running on rq. + */ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) @@ -170,30 +330,91 @@ * * task_timeslice() is the interface that is used by the scheduler. */ - -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * \ + (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } +/* + * The task_hot macro takes a process, the current time, and a scheduler domain. + * A scheduler domain is essentially a grouping of processors that share cache. + * task_hot determines whether or not cache in a scheduler domain is likely + * to contain data that the given process could use. 
The value cache_hot_time + * is the amount of time that data is likely to remain in the cache. Thus, if + * the time between when the process was last run and now is less than that + * amount of time, it is likely that the cache will still be hot (i.e. contain + * relevant data). + */ #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) +/* These are the runqueue data structures: */ + /* - * These are the runqueue data structures: + * The BITMAP_SIZE macro resolves to the number of long integers + * required to create a bitmap with one bit per scheduler priority + * (there are MAX_PRIO priorities). + * + * The "...+1+7)/8" part might seem odd. MAX_PRIO + 1 covers all priorities, + * adding 7 makes the integer division by 8 round up to a whole number of bytes. */ - #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) typedef struct runqueue runqueue_t; +/* + * The prio_array data structure is extremely important as it is what allows + * the Linux scheduling algorithm to perform in O(1) time. + * + * The basic structure in the Linux scheduler is the runqueue, defined below. + * There is one runqueue per processor, and within that runqueue there are two + * structures of type prio_array. One is for tasks that have not used up their + * timeslice yet, the other is for tasks that have used up their timeslice. The + * former are considered active, the latter expired. Note that active and expired + * has nothing to do with whether or not a task is runnable - active simply means + * that since the last time timeslices were allocated, a given task in that queue + * has not used up its timeslice. A task in the active list still has time available + * on the CPU, tasks in the expired list have used up their timeslice. + * + * The nr_active value stores the number of runnable tasks in the prio_array. 
The + * bitmap is a string of bits, one for each priority level on the system (140 by + * default), that indicates whether or not there are any tasks in the prio_array + * at a given priority level. The queue value is an array of pointers to arrays + * that store all tasks at a given priority level. + * + * So if there is only one runnable task in the prio_array, nr_active will be equal to + * one. If that task is not RT, and it has a nice value of -1 (PRIO 119), there will be + * a one in the 119th position of the bitmap to indicate that there is a task in the + * prio_array at that priority level. The queue array would have a pointer at the 119th + * position pointing to an array of length 1, its single element being the task in question. + * + * This is very useful because in order to determine the next task to run, the scheduler simply + * 1) looks to see if there are any runnable tasks in its active prio_array (i.e. is nr_active > 0) + * 2) if so, go to step 3 otherwise go to step 6 + * 3) find the first 1 in the active prio_array's bitmap. There must be a 1 somewhere since + * we know that there is a task in the prio_array and it must have a priority level. + * 4) run the first task in the array at the position in the prio_array's queue equal to + * the first 1 found in the bitmap. + * 5) when the task is done running for some reason, recalculate its new timeslice and put it + * in the expired prio_array. decrement nr_active in the active prio_array, and increment + * it in the expired prio_array. if the task was the last task at a given priority, + * clear the priority's bit in the active prio_array and make sure the priority's bit + * is set in the expired prio_array. repeat steps 1-4 until no tasks exist in the active + * prio_array. + * 6) when no tasks exist in the active prio_array, swap the active and expired prio_arrays + * and start over again. 
since timeslices are recalculated for each process when + * it is put onto the expired array, the swap of prio_arrays is fast (i.e. no + * sitting around recalculating a timeslice for every task) + * + * This results in O(1) behavior since no step in the process requires iterating over a number + * of tasks that grows larger when the total number of tasks grows. + */ struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; + unsigned int nr_active; /* number of runnable tasks in this prio_array */ + unsigned long bitmap[BITMAP_SIZE]; /* bitmap showing which priority levels contain tasks */ + struct list_head queue[MAX_PRIO]; /* an array of list heads, one for each priority on the system */ }; /* @@ -204,50 +425,61 @@ struct prio_array { * acquire operations must be ordered by ascending &runqueue. */ struct runqueue { - spinlock_t lock; + spinlock_t lock; /* lock that protects this runqueue */ - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned long nr_running; /* number of runnable tasks */ #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load; /* this CPU's load */ #endif - unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; - atomic_t nr_iowait; + unsigned long long nr_switches; /* number of context switches */ + unsigned long expired_timestamp, nr_uninterruptible; /* time of last array swap and number of + uninterruptible processes in queue */ + unsigned long long timestamp_last_tick; /* timestamp of last scheduler tick */ + task_t *curr, *idle; /* this processor's current and idle tasks */ + struct mm_struct *prev_mm; /* the last running task's mm_struct */ + prio_array_t *active, *expired, arrays[2]; /* the active and expired prio_arrays */ + int best_expired_prio; /* highest priority that exists in the expired prio_array */ + atomic_t nr_iowait; /* number of tasks in the queue waiting on i/o */ #ifdef CONFIG_SMP - struct sched_domain *sd; + struct sched_domain *sd; /* in SMP systems there can be different scheduler domains */ /* For active balancing */ - int active_balance; + int active_balance; /* */ int push_cpu; + /* the migration thread for the processor that this runqueue belongs to */ task_t *migration_thread; struct list_head migration_queue; #endif }; +/* Define one runqueue per CPU. 
*/ static DEFINE_PER_CPU(struct runqueue, runqueues); +/* Iterate through domains that a CPU is a part of */ #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +/* + * cpu_rq gets the runqueue for a given cpu + * + * this_rq gets the runqueue for the current cpu + * + * task_rq gets the runqueue that a certain task is in + * + * cpu_curr gets the current task on a given CPU + */ #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ +/* Default context-switch locking */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) @@ -264,23 +496,28 @@ static runqueue_t *task_rq_lock(task_t * struct runqueue *rq; repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); + local_irq_save(*flags); /* save irq flags */ + rq = task_rq(p); /* get runqueue for the task */ + spin_lock(&rq->lock); /* lock the runqueue */ + /* make sure the task is still on the runqueue we just locked */ if (unlikely(rq != task_rq(p))) { + /* if not, unlock and restore irq flags, then try again */ spin_unlock_irqrestore(&rq->lock, *flags); goto repeat_lock_task; } return rq; } +/* simply unlock a runqueue, not as touchy as locking! */ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) { spin_unlock_irqrestore(&rq->lock, *flags); } /* - * rq_lock - lock a given runqueue and disable interrupts. + * rq_lock - lock the current processor's runqueue and disable interrupts. + * Since the current CPU is executing this code, its runqueue is easier to + * lock than if we were trying to lock some other CPU's runqueue (see task_rq_lock()). 
*/ static runqueue_t *this_rq_lock(void) { @@ -293,6 +530,10 @@ static runqueue_t *this_rq_lock(void) return rq; } +/* + * A convenience method for making sure that runqueues get unlocked + * via the right lock mechanism. + */ static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -303,24 +544,37 @@ static inline void rq_unlock(runqueue_t */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { - array->nr_active--; + array->nr_active--; /* one less active task in the array */ list_del(&p->run_list); + /* + * Clear the bit that says there is a task in the prio array with a certain priority + * if no more tasks at p's priority in the prio array. + */ if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { + /* add the task at the right spot in the prio array */ list_add_tail(&p->run_list, array->queue + p->prio); + /* + * set the bit that says there is at least one task in the prio array + * with priority p->prio + */ __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + array->nr_active++; /* one more active task in the array */ + p->array = array; /* set the field in the task that says what prio array it is in */ } /* - * Used by the migration code - we pull tasks from the head of the - * remote queue so we want these tasks to show up at the head of the - * local queue: + * Migration code always has the highest priority. When CPUs go down (become + * idle), the idle task must get a higher priority than the migration code. + * This function is used by __activate_idle_task, which is called by + * sched_idle_next. sched_idle_next is called when CPUs get taken down. + * + * This is really similar to enqueue task, except it adds to the top of the list + * instead of the tail (list_add() instead of list_add_tail()). 
*/ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { @@ -347,13 +601,28 @@ static inline void enqueue_task_head(str static int effective_prio(task_t *p) { int bonus, prio; - + + /* don't do anything if this is a RT task */ if (rt_task(p)) return p->prio; + /* + * take the CURRENT_BONUS, which is sleep_avg mapped onto + * 0-MAX_BONUS, and subtract half of MAX_BONUS since it is + * twice the possible + or - bonus. So if MAX_BONUS is 10, + * and a task sleeps a lot, it might get a CURRENT_BONUS of + * say, 8. Subtracting 5, that makes 3. This will be subtracted + * from static_prio since the task should have a high priority + * and lower prio values are higher priority. If a task sleeps + * very little, the bonus value calculated here will be negative. + * In that case, the negative value will get subtracted from + * static_prio, lowering the priority. + */ bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + /* give the task a prio based on the just-calculated bonus and static_prio */ prio = p->static_prio - bonus; + /* make sure the prio value is within non-RT bounds and return it */ if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) @@ -379,8 +648,21 @@ static inline void __activate_idle_task( rq->nr_running++; } +/* + * This function recalculates a task's priority ("I know this because I can + * read" - John Fraser Hart). It is called by the main schedule() function + * when a task is moved to the expired prio array, and also when tasks are + * activated. + */ static void recalc_task_prio(task_t *p, unsigned long long now) { + /* + * __sleep_time is used because an unsigned long long will be able + * to hold a huge number, which might be the case in the calculation + * of "now - p-> timestamp" but will not be the case if the number + * is kept <= NS_MAX_SLEEP_AVG. So, once the number is calculated to + * be <= NS_MAX_SLEEP_AVG, then the unsigned long sleep_time is used. 
+ */ unsigned long long __sleep_time = now - p->timestamp; unsigned long sleep_time; @@ -393,7 +675,7 @@ static void recalc_task_prio(task_t *p, /* * User tasks that sleep a long time are categorised as * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving + * prevent them from suddenly becoming cpu hogs and starving * other processes. */ if (p->mm && p->activated != -1 && @@ -405,7 +687,9 @@ static void recalc_task_prio(task_t *p, } else { /* * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. + * rapidly it will rise with sleep time. If a task + * has a high sleep avg, CURRENT_BONUS(p) will be high, + * and thus MAX_BONUS - CURRENT_BONUS(p) will be low. */ sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; @@ -507,7 +791,13 @@ static void activate_task(task_t *p, run */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { + /* one less running task */ rq->nr_running--; + /* + * this is leaving the running state and + * becoming uninterruptible, so increment + * nr_uninterruptible + */ if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -527,7 +817,7 @@ static void resched_task(task_t *p) int need_resched, nrpolling; preempt_disable(); - /* minimise the chance of sending an interrupt to poll_idle() */ + /* minimize the chance of sending an interrupt to poll_idle() */ nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); @@ -543,15 +833,19 @@ static inline void resched_task(task_t * } #endif -/** +/* * task_curr - is this task currently executing on a CPU? - * @p: the task in question. 
*/ inline int task_curr(const task_t *p) { return cpu_curr(task_cpu(p)) == p; } +/* + * This section contains code for migrating tasks between CPUs on + * SMP systems + */ + #ifdef CONFIG_SMP enum request_type { REQ_MOVE_TASK, @@ -563,11 +857,11 @@ typedef struct { enum request_type type; /* For REQ_MOVE_TASK */ - task_t *task; - int dest_cpu; + task_t *task; /* task to operate on */ + int dest_cpu; /* if REQ_MOVE_TASK, this is the destination CPU */ /* For REQ_SET_DOMAIN */ - struct sched_domain *sd; + struct sched_domain *sd; /* destination domain */ struct completion done; } migration_req_t; @@ -589,6 +883,10 @@ static int migrate_task(task_t *p, int d return 0; } + /* + * fill in migration request fields and add task to a + * migration queue, to be migrated later + */ init_completion(&req->done); req->type = REQ_MOVE_TASK; req->task = p; @@ -640,6 +938,13 @@ void kick_process(task_t *p) preempt_disable(); cpu = task_cpu(p); + /* + * If the process is on this CPU, then it's already in kernel mode, because we're + * executing right now. In that case, don't tell it to reschedule. If the process + * is not the current process on some CPU, then kernel mode must kick in before + * it runs so again, don't bother rescheduling it. It should be obvious why this + * function doesn't apply on a UP system. 
+ */ if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); preempt_enable(); @@ -661,9 +966,7 @@ static inline unsigned long source_load( return min(rq->cpu_load, load_now); } -/* - * Return a high guess at the load of a migration-target cpu - */ +/* Return a high guess at the load of a migration-target cpu */ static inline unsigned long target_load(int cpu) { runqueue_t *rq = cpu_rq(cpu); @@ -672,7 +975,7 @@ static inline unsigned long target_load( return max(rq->cpu_load, load_now); } -#endif +#endif /* CONFIG_SMP */ /* * wake_idle() is useful especially on SMT architectures to wake a @@ -689,16 +992,28 @@ static int wake_idle(int cpu, task_t *p) struct sched_domain *sd; int i; + /* if the task is already on an idle CPU, leave it there */ if (idle_cpu(cpu)) return cpu; + /* don't change CPUs if the scheduler domain does not support WAKE_IDLE */ sd = rq->sd; if (!(sd->flags & SD_WAKE_IDLE)) return cpu; + /* + * First, put the &'ed value of the scheduler domain span + * and the online CPU map into tmp. Then, & tmp with the + * cpus that p is allowed to run on. That gives a list + * of potential CPUs in the map tmp. + */ cpus_and(tmp, sd->span, cpu_online_map); cpus_and(tmp, tmp, p->cpus_allowed); + /* + * cycle through the cpu map tmp, made above, + * and send the task to the first idle CPU. + */ for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; @@ -739,26 +1054,35 @@ static int try_to_wake_up(task_t * p, un int new_cpu; #endif + /* + * lock the task's runqueue, disabling interrupts, + * then check to see if the task is in one of the + * states we wish to wake it from. If not, get out. + */ rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) goto out; + /* the task is already awake if it is in a prio array! 
*/ if (p->array) goto out_running; - + cpu = task_cpu(p); this_cpu = smp_processor_id(); #ifdef CONFIG_SMP + /* if the task is running but was interrupted, we just need to activate it */ if (unlikely(task_running(rq, p))) goto out_activate; new_cpu = cpu; + /* if the task's CPU is this CPU or this CPU is not one it is allowed on... */ if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; + /* grab the load on the source and target CPUs */ load = source_load(cpu); this_load = target_load(this_cpu); @@ -809,8 +1133,10 @@ out_set_cpu: /* might preempt at this point */ rq = task_rq_lock(p, &flags); old_state = p->state; + /* If the state of p is not one we wish to wake from, get out */ if (!(old_state & state)) goto out; + /* if p is in a prio array, it is already running */ if (p->array) goto out_running; @@ -852,6 +1178,7 @@ out: return success; } +/* just an exported convenience function for try_to_wake_up() */ int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | @@ -899,7 +1226,8 @@ void fastcall sched_fork(task_t *p) p->time_slice = (current->time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. + * the parent if the child exits early enough. Set first_time_slice + * in order to indicate that p's timeslice is reclaimable. */ p->first_time_slice = 1; current->time_slice >>= 1; @@ -930,6 +1258,7 @@ void fastcall wake_up_forked_process(tas unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); + /* The freshly forked process must already be in the TASK_RUNNING state! */ BUG_ON(p->state != TASK_RUNNING); /* @@ -943,14 +1272,17 @@ void fastcall wake_up_forked_process(tas p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + /* Start off with interactive credit of 0. */ p->interactive_credit = 0; + /* Set an initial priority and CPU. The initial CPU is the current CPU. 
*/ p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); + /* If the parent (current) is not on a prio array, put p on the runqueue with __activate_task(). */ if (unlikely(!current->array)) __activate_task(p, rq); - else { + else { /* Otherwise just place it in the prio array the parent is in. */ p->prio = current->prio; list_add_tail(&p->run_list, &current->run_list); p->array = current->array; @@ -975,6 +1307,10 @@ void fastcall sched_exit(task_t * p) runqueue_t *rq; local_irq_save(flags); + /* + * if the exiting child was only on its first time slice, + * give it back to the parent + */ if (p->first_time_slice) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) @@ -1055,6 +1391,10 @@ task_t * context_switch(runqueue_t *rq, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; + /* + * If the new task doesn't have an mm, make it the same + * as the old task's. + */ if (unlikely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); @@ -1062,6 +1402,13 @@ task_t * context_switch(runqueue_t *rq, } else switch_mm(oldmm, mm, next); + /* + * If the previous task does not have an mm, + * set its active_mm field to NULL, warn, and + * then set the runqueue's previous mm to the previous + * task's active_mm for use in making good cache hotness + * decisions in the future. + */ if (unlikely(!prev->mm)) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); @@ -1177,9 +1524,15 @@ static int find_idlest_cpu(struct task_s min_cpu = UINT_MAX; min_load = ULONG_MAX; + /* set mask to a map created by + * 1) getting a bitmap of online CPUs in the right scheduler domain + * 2) & the map from step 1 with p's allowed CPU + * The result is a map of CPUs that p could potentially run on. 
+ */ cpus_and(mask, sd->span, cpu_online_map); cpus_and(mask, mask, p->cpus_allowed); + /* cycle through each CPU looking for the one with the lowest load */ for_each_cpu_mask(i, mask) { load = target_load(i); @@ -1226,11 +1579,13 @@ void fastcall wake_up_forked_thread(task /* * Find the largest domain that this CPU is part of that - * is willing to balance on clone: + * is willing to balance on clone; that is, a domain willing + * to accept cloned tasks onto its CPUs. */ for_each_domain(this_cpu, tmp) if (tmp->flags & SD_BALANCE_CLONE) sd = tmp; + /* If a domain was found, choose its idlest CPU, otherwise just use this CPU */ if (sd) cpu = find_idlest_cpu(p, this_cpu, sd); else @@ -1256,7 +1611,9 @@ lock_again: /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. + * from forking tasks that are max-interactive. This is similar + * to what we do when new processes are forked + * (in wake_up_forked_process()) */ current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); @@ -1341,10 +1698,16 @@ void sched_balance_exec(void) if (this_rq()->nr_running <= 1) goto out; + /* + * Find the largest domain this CPU belongs to that is willing to + * balance on exec. + */ for_each_domain(this_cpu, tmp) if (tmp->flags & SD_BALANCE_EXEC) sd = tmp; + /* If a domain was found, find its idlest CPU and migrate there + * Otherwise, just stay on this CPU. 
*/ if (sd) { new_cpu = find_idlest_cpu(current, this_cpu, sd); if (new_cpu != this_cpu) { @@ -1385,10 +1748,11 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + /* account for differences in timestamp between CPUs */ + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* - * Note that idle threads have a prio of MAX_PRIO, for this test + * Note that idle threads have a prio of MAX_PRIO, causing this test * to be always true for them. */ if (TASK_PREEMPTS_CURR(p, this_rq)) @@ -1514,17 +1878,21 @@ find_busiest_group(struct sched_domain * max_load = this_load = total_load = total_pwr = 0; + /* go through each group, done with a do loop since this is a circular linked list */ do { cpumask_t tmp; unsigned long load; int local_group; int i, nr_cpus = 0; + /* is the current CPU in the group we're looking at? */ local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ avg_load = 0; + /* make a map, tmp, of CPUs in this group and online */ cpus_and(tmp, group->cpumask, cpu_online_map); + /* if this group doesn't contain any online CPUs, move on */ if (unlikely(cpus_empty(tmp))) goto nextgroup; @@ -1539,6 +1907,10 @@ find_busiest_group(struct sched_domain * avg_load += load; } + /* + * This check is redundant since it can never be true, and has + * apparently been fixed in Linux 2.6.10rc3 + */ if (!nr_cpus) goto nextgroup; @@ -1551,7 +1923,7 @@ find_busiest_group(struct sched_domain * if (local_group) { this_load = avg_load; this = group; - goto nextgroup; + goto nextgroup; /* pointless goto since it goes there anyway */ } else if (avg_load > max_load) { max_load = avg_load; busiest = group; @@ -1573,7 +1945,7 @@ nextgroup: * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to *
reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong + * actions would just result in more rebalancing later, and ping-ponging * tasks around. Thus we look for the minimum possible imbalance. * Negative imbalances (*we* are more loaded than anyone else) will * be counted as no imbalance for these purposes -- we can't fix that @@ -1985,6 +2357,7 @@ void scheduler_tick(int user_ticks, int runqueue_t *rq = this_rq(); task_t *p = current; + /* update last tick timestamp to now */ rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) @@ -1998,24 +2371,36 @@ void scheduler_tick(int user_ticks, int cpustat->softirq += sys_ticks; sys_ticks = 0; } - + + /* if the current task is the idle task... */ if (p == rq->idle) { + /* If at least one task is waiting on I/O, then + * the time since the last tick was spent waiting + * on I/O, and that is why we're idle. Otherwise, we just + * have nothing to do. Update cpustat accordingly. + */ if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; + /* wake up a priority sleeper since we're idle for one reason or another */ if (wake_priority_sleeper(rq)) goto out; + /* if we couldn't wake anything up, then try to rebalance */ rebalance_tick(cpu, rq, IDLE); + /* leave since we were idle and did what we could */ return; } + if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; else cpustat->user += user_ticks; cpustat->system += sys_ticks; - /* Task might have expired already, but not scheduled off yet */ + /* Task might have expired already, but not scheduled off yet. + * Possible since we're in a timer interrupt right now.
+ */ if (p->array != rq->active) { set_tsk_need_resched(p); goto out; @@ -2044,22 +2429,34 @@ void scheduler_tick(int user_ticks, int } goto out_unlock; } + /* if the task is out of time */ if (!--p->time_slice) { + /* dequeue it from the active prio array */ dequeue_task(p, rq->active); + /* flag it as needing a reschedule */ set_tsk_need_resched(p); + /* recalculate its priority */ p->prio = effective_prio(p); + /* give it a new timeslice */ p->time_slice = task_timeslice(p); + /* + * This can't be its first timeslice since it just ran out + * of one. Remember that tasks that exit on their first timeslice + * can give part of their timeslice back to the parent task. + */ p->first_time_slice = 0; if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; + /* if the task is not interactive or there is something starving on the expired list */ if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + /* enqueue the task on the expired list */ enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; - } else + } else /* otherwise put it back on the active list */ enqueue_task(p, rq->active); - } else { + } else { /* task is not out of time */ /* * Prevent a too long timeslice allowing a task to monopolize * the CPU. We do this by splitting up the timeslice into @@ -2088,12 +2485,22 @@ void scheduler_tick(int user_ticks, int } } out_unlock: + /* we are done messing with this runqueue so unlock it */ spin_unlock(&rq->lock); out: + /* see if we need to do some rebalancing */ rebalance_tick(cpu, rq, NOT_IDLE); } #ifdef CONFIG_SCHED_SMT +/* + * If there are other idle virtual processors associated with the given cpu, + * and they have runnable tasks, try to wake them up. This is called in + * schedule(), when the current CPU has no runnable tasks and idle rebalancing + * fails to add any runnable tasks.
This is because on SMT, tasks can be sleeping + * in order to give other sibling processors with higher priority tasks full + * access to cache. + */ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) { int i; @@ -2205,12 +2612,12 @@ asmlinkage void __sched schedule(void) } need_resched: - preempt_disable(); - prev = current; - rq = this_rq(); + preempt_disable(); /* do not allow this algorithm to be preempted */ + prev = current; /* whatever task is running now will be the previous task */ + rq = this_rq(); /* get the runqueue for the processor that needs scheduling */ release_kernel_lock(prev); - now = sched_clock(); + now = sched_clock(); /* get the current time in nanoseconds */ if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp; else @@ -2226,10 +2633,6 @@ need_resched: spin_lock_irq(&rq->lock); - /* - * if entering off of a kernel preemption go straight - * to picking the next task. - */ switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; @@ -2241,6 +2644,11 @@ need_resched: } cpu = smp_processor_id(); + /* + * If there are no runnable tasks in the runqueue, try to do an idle balance. + * If nothing is runnable after that, just switch to idle. No need to swap arrays + * since there is nothing runnable in the expired array or the active one. + */ if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); if (!rq->nr_running) { @@ -2251,6 +2659,7 @@ need_resched: } } + /* If there are no runnable tasks in the active prio array, swap arrays. 
*/ array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -2263,15 +2672,27 @@ need_resched: rq->best_expired_prio = MAX_PRIO; } + /* find the first priority level with tasks in it, and grab the first task in it */ idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + /* + * If there is a dependent sleeper, a task sleeping on a sibling virtual + * CPU on SMT systems, just switch to idle and let dependent_sleeper() wake + * up the dependent task. + */ if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; } + /* + * If the next task is not an RT task and has been woken up, + * give it a new priority calculated with a longer sleep time + * as a boost. If the task is a normal first-time wakeup + * (next->activated == 1), weigh down the bonus. + */ if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; @@ -2283,13 +2704,23 @@ need_resched: recalc_task_prio(next, next->timestamp + delta); enqueue_task(next, array); } + /* clear the next task's activated status */ next->activated = 0; + + /* make the switch to whatever next task was selected */ switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + /* subtract running time from previous task's sleep_avg */ prev->sleep_avg -= run_time; + + /* + * If the previous task's sleep average is 0 or lower now, + * set it to 0 and then drop its interactive credit since + * it obviously wasn't sleeping much and is thus less I/O bound.
+ */ if ((long)prev->sleep_avg <= 0) { prev->sleep_avg = 0; if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) @@ -2297,6 +2728,7 @@ switch_tasks: } prev->timestamp = now; + /* make the actual context switch if prev and next are not the same */ if (likely(prev != next)) { next->timestamp = now; rq->nr_switches++; @@ -2313,6 +2745,10 @@ switch_tasks: reacquire_kernel_lock(current); preempt_enable_no_resched(); + /* + * Since preemption was disabled this whole time, check to see if kernel + * preemption was requested (reschedule requested) and reschedule if so. + */ if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; } @@ -2331,12 +2767,18 @@ asmlinkage void __sched preempt_schedule /* * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. + * we do not want to preempt the current task. Just return. */ if (unlikely(ti->preempt_count || irqs_disabled())) return; need_resched: + /* + * Set preempt count to indicate that we are preempting, reschedule, + * and then clear the preempt count as rescheduling has happened. + * Only needs to reschedule once no matter how many times the reschedule + * was requested. + */ ti->preempt_count = PREEMPT_ACTIVE; schedule(); ti->preempt_count = 0; @@ -2350,6 +2792,7 @@ need_resched: EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ +/* exported call for trying to wake up a task */ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { task_t *p = curr->task; @@ -2372,11 +2815,17 @@ static void __wake_up_common(wait_queue_ { struct list_head *tmp, *next; + /* go through each task in the wait queue */ list_for_each_safe(tmp, next, &q->task_list) { wait_queue_t *curr; unsigned flags; curr = list_entry(tmp, wait_queue_t, task_list); flags = curr->flags; + /* + * Try to wake up the task, and if it was exclusive and there are more + * exclusive tasks in the wait queue, then quit.
Don't want to wake up + * more than one exclusive task at a time. + */ if (curr->func(curr, mode, sync, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) @@ -2428,9 +2877,11 @@ void fastcall __wake_up_sync(wait_queue_ unsigned long flags; int sync = 1; + /* obviously, leave if there is no wait queue */ if (unlikely(!q)) return; + /* if there are no exclusive tasks, don't do sync */ if (unlikely(!nr_exclusive)) sync = 0; @@ -2585,6 +3036,7 @@ void set_user_nice(task_t *p, long nice) if (array) dequeue_task(p, array); + /* set the new static_prio and just adjust the dynamic prio instead of recalculating */ old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; @@ -2743,6 +3195,7 @@ static int setscheduler(pid_t pid, int p */ rq = task_rq_lock(p, &flags); + /* makes sure the policy is sane */ if (policy < 0) policy = p->policy; else { @@ -2910,6 +3363,7 @@ asmlinkage long sys_sched_setaffinity(pi if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; + /* don't allow CPU hotplugging while we do this - obvious consequences */ lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -3397,9 +3851,9 @@ static void __migrate_task(struct task_s */ p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; - deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); - if (TASK_PREEMPTS_CURR(p, rq_dest)) + deactivate_task(p, rq_src); /* off the runqueue it is on */ + activate_task(p, rq_dest, 0); /* on the runqueue it should be on */ + if (TASK_PREEMPTS_CURR(p, rq_dest)) /* perhaps preempt dest cpu's current task */ resched_task(rq_dest->curr); } @@ -3421,25 +3875,33 @@ static int migration_thread(void * data) BUG_ON(rq->migration_thread != current); set_current_state(TASK_INTERRUPTIBLE); + + /* basically, just keep trying to be helpful in one way or another + * until we're told to die... 
+ */ while (!kthread_should_stop()) { struct list_head *head; migration_req_t *req; + /* can I freeze the current thread for you? */ if (current->flags & PF_FREEZE) refrigerator(PF_FREEZE); spin_lock_irq(&rq->lock); + /* Is this CPU offline? If so, I'll just go die. */ if (cpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); goto wait_to_die; } + /* does my runqueue need to be balanced? */ if (rq->active_balance) { active_load_balance(rq, cpu); rq->active_balance = 0; } + /* anything need to be migrated? If not, schedule me out. */ head = &rq->migration_queue; if (list_empty(head)) { @@ -3448,6 +3910,7 @@ static int migration_thread(void * data) set_current_state(TASK_INTERRUPTIBLE); continue; } + /* do some migration */ req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); @@ -3466,9 +3929,11 @@ static int migration_thread(void * data) complete(&req->done); } + /* migration thread suicide */ __set_current_state(TASK_RUNNING); return 0; + /* migration thread hospice... with no CPU, time is running out for us... */ wait_to_die: /* Wait for kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); @@ -3481,7 +3946,7 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ +/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ static void migrate_all_tasks(int src_cpu) { struct task_struct *tsk, *t; @@ -3729,6 +4194,7 @@ static void __init arch_init_sched_domai struct sched_group *node = &sched_group_nodes[i]; int j; + /* get mask of node cpus that are possible */ cpus_and(nodemask, tmp, cpu_possible_map); if (cpus_empty(nodemask)) @@ -3744,6 +4210,7 @@ static void __init arch_init_sched_domai cpu_set(j, cpu->cpumask); cpu->cpu_power = SCHED_LOAD_SCALE; + /* set up circular linked list */ if (!first_cpu) first_cpu = cpu; if (last_cpu)