/*
 *  linux/kernel/sched.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'sched.c' is the main kernel file. It contains scheduling primitives
 * (sleep_on, wakeup, schedule etc) as well as a number of simple system
 * call functions (type getpid(), which just extracts a field from
 * current-task)
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/fdreg.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/ptrace.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/tqueue.h>
#include <linux/resource.h>
#include <linux/mm.h>

#include <asm/system.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/pgtable.h>

#define TIMER_IRQ 0

#include <linux/timex.h>

/*
 * kernel variables
 */
long tick = 1000000 / HZ;		/* timer interrupt period */
volatile struct timeval xtime;		/* The current time */
int tickadj = 500/HZ;			/* microsecs */

DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
DECLARE_TASK_QUEUE(tq_scheduler);

/*
 * phase-lock loop variables
 */
int time_status = TIME_BAD;     /* clock synchronization status */
long time_offset = 0;           /* time adjustment (us) */
long time_constant = 0;         /* pll time constant */
long time_tolerance = MAXFREQ;  /* frequency tolerance (ppm) */
long time_precision = 1;        /* clock precision (us) */
long time_maxerror = 0x70000000;/* maximum error */
long time_esterror = 0x70000000;/* estimated error */
long time_phase = 0;            /* phase offset (scaled us) */
long time_freq = 0;             /* frequency offset (scaled ppm) */
long time_adj = 0;              /* tick adjust (scaled 1 / HZ) */
long time_reftime = 0;          /* time at last adjustment (s) */

long time_adjust = 0;
long time_adjust_step = 0;

int need_resched = 0;
unsigned long event = 0;

extern int _setitimer(int, struct itimerval *, struct itimerval *);
unsigned long * prof_buffer = NULL;
unsigned long prof_len = 0;

#define _S(nr) (1<<((nr)-1))

extern void mem_use(void);

extern int timer_interrupt(void);

static unsigned long init_kernel_stack[1024] = { STACK_MAGIC, };
unsigned long init_user_stack[1024] = { STACK_MAGIC, };
static struct vm_area_struct init_mmap = INIT_MMAP;
struct task_struct init_task = INIT_TASK;

unsigned long volatile jiffies=0;

struct task_struct *current = &init_task;
struct task_struct *last_task_used_math = NULL;

struct task_struct * task[NR_TASKS] = {&init_task, };

struct kernel_stat kstat = { 0 };

unsigned long itimer_ticks = 0;
unsigned long itimer_next = ~0;

/*
 *  'schedule()' is the scheduler function. It's a very simple and nice
 * scheduler: it's not perfect, but certainly works for most things.
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 *
 * The "confuse_gcc" goto is used only to get better assembly code..
 * Dijkstra probably hates me.
 */
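/*
 * Worked example of the selection below (an illustration, not part of
 * the original file): schedule() picks the runnable task with the
 * largest 'counter'. Once the best runnable counter has dropped to
 * zero, every task gets counter = (counter >> 1) + priority, so with
 * the default priority of 15 a task that just burned its whole slice
 * restarts at 0/2 + 15 = 15 ticks, while one that slept holding 10
 * ticks restarts at 10/2 + 15 = 20 and wins the next round. That is
 * the entire interactivity heuristic of this scheduler.
 */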
asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * next;
	unsigned long ticks;

/* check alarm, wake up any interruptible tasks that have got a signal */

	if (intr_count) {
		printk("Aiee: scheduling in interrupt\n");
		intr_count = 0;
	}
	run_task_queue(&tq_scheduler);

	cli();
	ticks = itimer_ticks;
	itimer_ticks = 0;
	itimer_next = ~0;
	sti();
	need_resched = 0;
	nr_running = 0;
	p = &init_task;
	for (;;) {
		if ((p = p->next_task) == &init_task)
			goto confuse_gcc1;
		if (ticks && p->it_real_value) {
			if (p->it_real_value <= ticks) {
				send_sig(SIGALRM, p, 1);
				if (!p->it_real_incr) {
					p->it_real_value = 0;
					goto end_itimer;
				}
				do {
					p->it_real_value += p->it_real_incr;
				} while (p->it_real_value <= ticks);
			}
			p->it_real_value -= ticks;
			if (p->it_real_value < itimer_next)
				itimer_next = p->it_real_value;
		}
end_itimer:
		if (p->state != TASK_INTERRUPTIBLE)
			continue;
		if (p->signal & ~p->blocked) {
			p->state = TASK_RUNNING;
			continue;
		}
		if (p->timeout && p->timeout <= jiffies) {
			p->timeout = 0;
			p->state = TASK_RUNNING;
		}
	}
confuse_gcc1:

/* this is the scheduler proper: */
#if 0
	/* give processes that go to sleep a bit higher priority.. */
	/* This depends on the values for TASK_XXX */
	/* This gives smoother scheduling for some things, but */
	/* can be very unfair under some circumstances, so.. */
	if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state &&
	    current->counter < current->priority*2) {
		++current->counter;
	}
#endif
	c = -1000;
	next = p = &init_task;
	for (;;) {
		if ((p = p->next_task) == &init_task)
			goto confuse_gcc2;
		if (p->state == TASK_RUNNING) {
			nr_running++;
			if (p->counter > c)
				c = p->counter, next = p;
		}
	}
confuse_gcc2:
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
	if (current == next)
		return;
	kstat.context_swtch++;
	switch_to(next);
}

asmlinkage int sys_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;
	schedule();
	return -ERESTARTNOHAND;
}

/*
 * wake_up doesn't wake up stopped processes - they have to be awakened
 * with signals or similar.
 *
 * Note that this doesn't need cli-sti pairs: interrupts may not change
 * the wait-queue structures directly, but only call wake_up() to wake
 * a process. The process itself must remove the queue once it has woken.
 */
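/*
 * Usage sketch (hypothetical driver code, not part of this file): a
 * reader sleeps on a driver-private queue until data arrives, and the
 * interrupt handler wakes it up. 'rx_wait' and 'data_ready()' are
 * made-up names for illustration:
 *
 *	static struct wait_queue * rx_wait = NULL;
 *
 *	while (!data_ready())			(process context)
 *		interruptible_sleep_on(&rx_wait);
 *
 *	wake_up_interruptible(&rx_wait);	(interrupt context)
 *
 * The sleeper is added to and removed from the queue by __sleep_on()
 * itself, matching the rule in the comment above that the process,
 * not the waker, removes its queue entry.
 */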
void wake_up(struct wait_queue **q)
{
	struct wait_queue *tmp;
	struct task_struct * p;

	if (!q || !(tmp = *q))
		return;
	do {
		if ((p = tmp->task) != NULL) {
			if ((p->state == TASK_UNINTERRUPTIBLE) ||
			    (p->state == TASK_INTERRUPTIBLE)) {
				p->state = TASK_RUNNING;
				if (p->counter > current->counter + 3)
					need_resched = 1;
			}
		}
		if (!tmp->next) {
			printk("wait_queue is bad (eip = %p)\n",
				__builtin_return_address(0));
			printk("        q = %p\n",q);
			printk("       *q = %p\n",*q);
			printk("      tmp = %p\n",tmp);
			break;
		}
		tmp = tmp->next;
	} while (tmp != *q);
}

void wake_up_interruptible(struct wait_queue **q)
{
	struct wait_queue *tmp;
	struct task_struct * p;

	if (!q || !(tmp = *q))
		return;
	do {
		if ((p = tmp->task) != NULL) {
			if (p->state == TASK_INTERRUPTIBLE) {
				p->state = TASK_RUNNING;
				if (p->counter > current->counter + 3)
					need_resched = 1;
			}
		}
		if (!tmp->next) {
			printk("wait_queue is bad (eip = %p)\n",
				__builtin_return_address(0));
			printk("        q = %p\n",q);
			printk("       *q = %p\n",*q);
			printk("      tmp = %p\n",tmp);
			break;
		}
		tmp = tmp->next;
	} while (tmp != *q);
}

void __down(struct semaphore * sem)
{
	struct wait_queue wait = { current, NULL };
	add_wait_queue(&sem->wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	while (sem->count <= 0) {
		schedule();
		current->state = TASK_UNINTERRUPTIBLE;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(&sem->wait, &wait);
}

static inline void __sleep_on(struct wait_queue **p, int state)
{
	unsigned long flags;
	struct wait_queue wait = { current, NULL };

	if (!p)
		return;
	if (current == task[0])
		panic("task[0] trying to sleep");
	current->state = state;
	add_wait_queue(p, &wait);
	save_flags(flags);
	sti();
	schedule();
	remove_wait_queue(p, &wait);
	restore_flags(flags);
}

void interruptible_sleep_on(struct wait_queue **p)
{
	__sleep_on(p,TASK_INTERRUPTIBLE);
}

void sleep_on(struct wait_queue **p)
{
	__sleep_on(p,TASK_UNINTERRUPTIBLE);
}

/*
 * The head for the timer-list has an "expires" field of MAX_UINT,
 * and the sorting routine counts on this..
 */
static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL };
#define SLOW_BUT_DEBUGGING_TIMERS 1

void add_timer(struct timer_list * timer)
{
	unsigned long flags;
	struct timer_list *p;

#if SLOW_BUT_DEBUGGING_TIMERS
	if (timer->next || timer->prev) {
		printk("add_timer() called with non-zero list from %p\n",
			__builtin_return_address(0));
		return;
	}
#endif
	p = &timer_head;
	timer->expires += jiffies;
	save_flags(flags);
	cli();
	do {
		p = p->next;
	} while (timer->expires > p->expires);
	timer->next = p;
	timer->prev = p->prev;
	p->prev = timer;
	timer->prev->next = timer;
	restore_flags(flags);
}

int del_timer(struct timer_list * timer)
{
	unsigned long flags;
#if SLOW_BUT_DEBUGGING_TIMERS
	struct timer_list * p;

	p = &timer_head;
	save_flags(flags);
	cli();
	while ((p = p->next) != &timer_head) {
		if (p == timer) {
			timer->next->prev = timer->prev;
			timer->prev->next = timer->next;
			timer->next = timer->prev = NULL;
			restore_flags(flags);
			timer->expires -= jiffies;
			return 1;
		}
	}
	if (timer->next || timer->prev)
		printk("del_timer() called from %p with timer not initialized\n",
			__builtin_return_address(0));
	restore_flags(flags);
	return 0;
#else
	save_flags(flags);
	cli();
	if (timer->next) {
		timer->next->prev = timer->prev;
		timer->prev->next = timer->next;
		timer->next = timer->prev = NULL;
		restore_flags(flags);
		timer->expires -= jiffies;
		return 1;
	}
	restore_flags(flags);
	return 0;
#endif
}

unsigned long timer_active = 0;
struct timer_struct timer_table[32];
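/*
 * Usage sketch for the list timers above (illustration only; my_timer
 * and my_timeout are made-up names). Arming a one-shot timeout HZ
 * jiffies, i.e. one second, from now:
 *
 *	static void my_timeout(unsigned long data)
 *	{
 *		...		(runs from timer_bh, interrupts enabled)
 *	}
 *
 *	static struct timer_list my_timer = { NULL, NULL, 0, 0, my_timeout };
 *
 *	my_timer.expires = HZ;	(relative: add_timer() adds jiffies)
 *	add_timer(&my_timer);
 *	...
 *	del_timer(&my_timer);	(returns 1 if it had not expired yet)
 */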
/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc
 * all seem to differ on different machines.
 */
unsigned long avenrun[3] = { 0,0,0 };

/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	struct task_struct **p;
	unsigned long nr = 0;

	for(p = &LAST_TASK; p > &FIRST_TASK; --p)
		if (*p && ((*p)->state == TASK_RUNNING ||
			   (*p)->state == TASK_UNINTERRUPTIBLE ||
			   (*p)->state == TASK_SWAPPING))
			nr += FIXED_1;
	return nr;
}

static inline void calc_load(void)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	if (count-- > 0)
		return;
	count = LOAD_FREQ;
	active_tasks = count_active_tasks();
	CALC_LOAD(avenrun[0], EXP_1, active_tasks);
	CALC_LOAD(avenrun[1], EXP_5, active_tasks);
	CALC_LOAD(avenrun[2], EXP_15, active_tasks);
}

/*
 * this routine handles the overflow of the microsecond field
 *
 * The tricky bits of code to handle the accurate clock support
 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 * They were originally developed for SUN and DEC kernels.
 * All the kudos should go to Dave for this stuff.
 *
 * These were ported to Linux by Philip Gladstone.
 */
static void second_overflow(void)
{
	long ltemp;

	/* Bump the maxerror field */
	time_maxerror = (0x70000000-time_maxerror < time_tolerance) ?
	    0x70000000 : (time_maxerror + time_tolerance);

	/* Run the PLL */
	if (time_offset < 0) {
		ltemp = (-(time_offset+1) >> (SHIFT_KG + time_constant)) + 1;
		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
		time_offset += (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE);
		time_adj = - time_adj;
	} else if (time_offset > 0) {
		ltemp = ((time_offset-1) >> (SHIFT_KG + time_constant)) + 1;
		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
		time_offset -= (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE);
	} else {
		time_adj = 0;
	}

	time_adj += (time_freq >> (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE))
	    + FINETUNE;

	/* Handle the leap second stuff */
	switch (time_status) {
		case TIME_INS:
		/* ugly divide should be replaced */
		if (xtime.tv_sec % 86400 == 0) {
			xtime.tv_sec--; /* !! */
			time_status = TIME_OOP;
			printk("Clock: inserting leap second 23:59:60 UTC\n");
		}
		break;

		case TIME_DEL:
		/* ugly divide should be replaced */
		if (xtime.tv_sec % 86400 == 86399) {
			xtime.tv_sec++;
			time_status = TIME_OK;
			printk("Clock: deleting leap second 23:59:59 UTC\n");
		}
		break;

		case TIME_OOP:
		time_status = TIME_OK;
		break;
	}
}

/*
 * disregard lost ticks for now.. We don't care enough.
 */
static void timer_bh(void * unused)
{
	unsigned long mask;
	struct timer_struct *tp;
	struct timer_list * timer;

	cli();
	while ((timer = timer_head.next) != &timer_head && timer->expires < jiffies) {
		void (*fn)(unsigned long) = timer->function;
		unsigned long data = timer->data;
		timer->next->prev = timer->prev;
		timer->prev->next = timer->next;
		timer->next = timer->prev = NULL;
		sti();
		fn(data);
		cli();
	}
	sti();

	for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
		if (mask > timer_active)
			break;
		if (!(mask & timer_active))
			continue;
		if (tp->expires > jiffies)
			continue;
		timer_active &= ~mask;
		tp->fn();
		sti();
	}
}

void tqueue_bh(void * unused)
{
	run_task_queue(&tq_timer);
}

void immediate_bh(void * unused)
{
	run_task_queue(&tq_immediate);
}

/*
 * The int argument is really a (struct pt_regs *), in case the
 * interrupt wants to know from where it was called. The timer
 * irq uses this to decide if it should update the user or system
 * times.
 */
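/*
 * Worked numbers for the tick bookkeeping below (illustration only,
 * assuming the usual i386 HZ of 100): tick = 1000000/HZ = 10000 us,
 * so every timer interrupt normally advances xtime.tv_usec by 10000.
 * While an adjtime() adjustment is pending, the extra step per tick
 * is clamped to +/- tickadj = 500/HZ = 5 us, i.e. the clock is slewed
 * by at most 500 us per second of real time.
 */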
static void do_timer(int irq, struct pt_regs * regs)
{
	unsigned long mask;
	struct timer_struct *tp;
	/* last time the cmos clock got updated */
	static long last_rtc_update=0;
	extern int set_rtc_mmss(unsigned long);
	long ltemp, psecs;

	/* Advance the phase, once it gets to one microsecond, then
	 * advance the tick more.
	 */
	time_phase += time_adj;
	if (time_phase < -FINEUSEC) {
		ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		xtime.tv_usec += tick + time_adjust_step - ltemp;
	}
	else if (time_phase > FINEUSEC) {
		ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		xtime.tv_usec += tick + time_adjust_step + ltemp;
	} else
		xtime.tv_usec += tick + time_adjust_step;

	if (time_adjust) {
		/* We are doing an adjtime thing.
		 *
		 * Modify the value of the tick for next time.
		 * Note that a positive delta means we want the clock
		 * to run fast. This means that the tick should be bigger
		 *
		 * Limit the amount of the step for *next* tick to be
		 * in the range -tickadj .. +tickadj
		 */
		if (time_adjust > tickadj)
			time_adjust_step = tickadj;
		else if (time_adjust < -tickadj)
			time_adjust_step = -tickadj;
		else
			time_adjust_step = time_adjust;

		/* Reduce by this step the amount of time left */
		time_adjust -= time_adjust_step;
	}
	else
		time_adjust_step = 0;

	if (xtime.tv_usec >= 1000000) {
		xtime.tv_usec -= 1000000;
		xtime.tv_sec++;
		second_overflow();
	}

	/* If we have an externally synchronized Linux clock, then update
	 * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() has to be
	 * called as close as possible to 500 ms before the new second starts.
	 */
	if (time_status != TIME_BAD && xtime.tv_sec > last_rtc_update + 660 &&
	    xtime.tv_usec > 500000 - (tick >> 1) &&
	    xtime.tv_usec < 500000 + (tick >> 1))
		if (set_rtc_mmss(xtime.tv_sec) == 0)
			last_rtc_update = xtime.tv_sec;
		else
			last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */

	jiffies++;
	calc_load();
	if (user_mode(regs)) {
		current->utime++;
		if (current != task[0]) {
			if (current->priority < 15)
				kstat.cpu_nice++;
			else
				kstat.cpu_user++;
		}
		/* Update ITIMER_VIRT for current task if not in a system call */
		if (current->it_virt_value && !(--current->it_virt_value)) {
			current->it_virt_value = current->it_virt_incr;
			send_sig(SIGVTALRM,current,1);
		}
	} else {
		current->stime++;
		if(current != task[0])
			kstat.cpu_system++;
#ifdef CONFIG_PROFILE
		if (prof_buffer && current != task[0]) {
			unsigned long eip = regs->eip;
			eip >>= CONFIG_PROFILE_SHIFT;
			if (eip < prof_len)
				prof_buffer[eip]++;
		}
#endif
	}
	/*
	 * check the cpu time limit on the process.
	 */
	if ((current->rlim[RLIMIT_CPU].rlim_max != RLIM_INFINITY) &&
	    (((current->stime + current->utime) / HZ) >=
	    current->rlim[RLIMIT_CPU].rlim_max))
		send_sig(SIGKILL, current, 1);
	if ((current->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) &&
	    (((current->stime + current->utime) % HZ) == 0)) {
		psecs = (current->stime + current->utime) / HZ;
		/* send when equal */
		if (psecs == current->rlim[RLIMIT_CPU].rlim_cur)
			send_sig(SIGXCPU, current, 1);
		/* and every five seconds thereafter. */
		else if ((psecs > current->rlim[RLIMIT_CPU].rlim_cur) &&
		         ((psecs - current->rlim[RLIMIT_CPU].rlim_cur) % 5) == 0)
			send_sig(SIGXCPU, current, 1);
	}

	if (current != task[0] && 0 > --current->counter) {
		current->counter = 0;
		need_resched = 1;
	}
	/* Update ITIMER_PROF for the current task */
	if (current->it_prof_value && !(--current->it_prof_value)) {
		current->it_prof_value = current->it_prof_incr;
		send_sig(SIGPROF,current,1);
	}

	for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
		if (mask > timer_active)
			break;
		if (!(mask & timer_active))
			continue;
		if (tp->expires > jiffies)
			continue;
		mark_bh(TIMER_BH);
	}
	itimer_ticks++;
	if (itimer_ticks > itimer_next)
		need_resched = 1;
	if (timer_head.next->expires < jiffies)
		mark_bh(TIMER_BH);
	if (tq_timer != &tq_last)
		mark_bh(TQUEUE_BH);
}
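/*
 * Example of the RLIMIT_CPU checks above (illustration only): with a
 * soft limit of 10 CPU seconds and a hard limit of 60, a task gets
 * SIGXCPU when stime + utime first reaches 10*HZ ticks, again at 15,
 * 20, 25, ... CPU seconds, and SIGKILL once it reaches 60. The
 * "% HZ == 0" test makes each CPU second boundary fire only once.
 */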
asmlinkage int sys_alarm(long seconds)
{
	struct itimerval it_new, it_old;

	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	it_new.it_value.tv_sec = seconds;
	it_new.it_value.tv_usec = 0;
	_setitimer(ITIMER_REAL, &it_new, &it_old);
	return(it_old.it_value.tv_sec + (it_old.it_value.tv_usec / 1000000));
}

asmlinkage int sys_getpid(void)
{
	return current->pid;
}

asmlinkage int sys_getppid(void)
{
	return current->p_opptr->pid;
}

asmlinkage int sys_getuid(void)
{
	return current->uid;
}

asmlinkage int sys_geteuid(void)
{
	return current->euid;
}

asmlinkage int sys_getgid(void)
{
	return current->gid;
}

asmlinkage int sys_getegid(void)
{
	return current->egid;
}

asmlinkage int sys_nice(long increment)
{
	int newprio;

	if (increment < 0 && !suser())
		return -EPERM;
	newprio = current->priority - increment;
	if (newprio < 1)
		newprio = 1;
	if (newprio > 35)
		newprio = 35;
	current->priority = newprio;
	return 0;
}

static void show_task(int nr,struct task_struct * p)
{
	unsigned long free;
	static char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };

	printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
	if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *))
		printk(stat_nam[p->state]);
	else
		printk(" ");
#ifdef __i386__
	if (p == current)
		printk(" current  ");
	else
		printk(" %08lX ", ((unsigned long *)p->tss.esp)[3]);
#endif
	for (free = 1; free < 1024 ; free++) {
		if (((unsigned long *)p->kernel_stack_page)[free])
			break;
	}
	printk("%5lu %5d %6d ", free << 2, p->pid, p->p_pptr->pid);
	if (p->p_cptr)
		printk("%5d ", p->p_cptr->pid);
	else
		printk("      ");
	if (p->p_ysptr)
		printk("%7d", p->p_ysptr->pid);
	else
		printk("       ");
	if (p->p_osptr)
		printk(" %5d\n", p->p_osptr->pid);
	else
		printk("\n");
}

void show_state(void)
{
	int i;

	printk("                         free                        sibling\n");
	printk("  task             PC    stack   pid father child younger older\n");
	for (i=0 ; i<NR_TASKS ; i++)
		if (task[i])
			show_task(i,task[i]);
}
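/*
 * Usage note for sys_nice() above (illustration only): 'priority' here
 * is the counter refill in ticks rather than the POSIX nice value, so
 * nice(5) called by a task at the default priority of 15 leaves it at
 * 10, a third less CPU per scheduling round, and the result is always
 * clamped to the range 1..35.
 */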