00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include <linux/mm.h>
00030 #include <linux/module.h>
00031 #include <linux/nmi.h>
00032 #include <linux/init.h>
00033 #include <linux/uaccess.h>
00034 #include <linux/highmem.h>
00035 #include <linux/smp_lock.h>
00036 #include <asm/mmu_context.h>
00037 #include <linux/interrupt.h>
00038 #include <linux/capability.h>
00039 #include <linux/completion.h>
00040 #include <linux/kernel_stat.h>
00041 #include <linux/debug_locks.h>
00042 #include <linux/security.h>
00043 #include <linux/notifier.h>
00044 #include <linux/profile.h>
00045 #include <linux/freezer.h>
00046 #include <linux/vmalloc.h>
00047 #include <linux/blkdev.h>
00048 #include <linux/delay.h>
00049 #include <linux/pid_namespace.h>
00050 #include <linux/smp.h>
00051 #include <linux/threads.h>
00052 #include <linux/timer.h>
00053 #include <linux/rcupdate.h>
00054 #include <linux/cpu.h>
00055 #include <linux/cpuset.h>
00056 #include <linux/percpu.h>
00057 #include <linux/kthread.h>
00058 #include <linux/proc_fs.h>
00059 #include <linux/seq_file.h>
00060 #include <linux/sysctl.h>
00061 #include <linux/syscalls.h>
00062 #include <linux/times.h>
00063 #include <linux/tsacct_kern.h>
00064 #include <linux/kprobes.h>
00065 #include <linux/delayacct.h>
00066 #include <linux/reciprocal_div.h>
00067 #include <linux/unistd.h>
00068 #include <linux/pagemap.h>
00069 #include <linux/hrtimer.h>
00070 #include <linux/tick.h>
00071 #include <linux/bootmem.h>
00072 #include <linux/debugfs.h>
00073 #include <linux/ctype.h>
00074 #include <linux/ftrace.h>
00075 #include <trace/sched.h>
00076
00077 #include <asm/tlb.h>
00078 #include <asm/irq_regs.h>
00079
00080 #include "sched_cpupri.h"
00081
00082 #ifdef DDE_LINUX
00083
00084 extern int try_to_wake_up(struct task_struct *p, unsigned int state, int sync);
00085 #endif
00086
00087
00088 #ifndef DDE_LINUX
00089
00090
00091
00092
00093
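/*
 * Fallback, jiffies-resolution clock: architectures that provide a
 * finer-grained sched_clock() override this weak definition.
 */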
00094 unsigned long long __attribute__((weak)) sched_clock(void)
00095 {
00096 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
00097 }
00098
00099
00100
00101
00102
00103
00104 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
00105 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
00106 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
00107
00108
00109
00110
00111
00112
00113 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
00114 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
00115 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
00116
00117
00118
00119
00120 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
00121
00122 #define NICE_0_LOAD SCHED_LOAD_SCALE
00123 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
00124
00125
00126
00127
00128
00129
00130
00131 #define DEF_TIMESLICE (100 * HZ / 1000)
00132
00133
00134
00135
00136 #define RUNTIME_INF ((u64)~0ULL)
00137
00138 DEFINE_TRACE(sched_wait_task);
00139 DEFINE_TRACE(sched_wakeup);
00140 DEFINE_TRACE(sched_wakeup_new);
00141 DEFINE_TRACE(sched_switch);
00142 DEFINE_TRACE(sched_migrate_task);
00143
00144 #ifdef CONFIG_SMP
00145
00146 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
00147
00148
00149
00150
00151
00152 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
00153 {
00154 return reciprocal_divide(load, sg->reciprocal_cpu_power);
00155 }
00156
00157
00158
00159
00160
00161 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
00162 {
00163 sg->__cpu_power += val;
00164 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
00165 }
00166 #endif
00167
00168 static inline int rt_policy(int policy)
00169 {
00170 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
00171 return 1;
00172 return 0;
00173 }
00174
00175 static inline int task_has_rt_policy(struct task_struct *p)
00176 {
00177 return rt_policy(p->policy);
00178 }
00179
00180
00181
00182
00183 struct rt_prio_array {
00184 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
00185 struct list_head queue[MAX_RT_PRIO];
00186 };
00187
00188 struct rt_bandwidth {
00189
00190 spinlock_t rt_runtime_lock;
00191 ktime_t rt_period;
00192 u64 rt_runtime;
00193 struct hrtimer rt_period_timer;
00194 };
00195
00196 static struct rt_bandwidth def_rt_bandwidth;
00197
00198 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
00199
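/*
 * Periodic timer for RT bandwidth control: each expiry forwards the
 * timer by rt_period and lets do_sched_rt_period_timer() replenish the
 * runtime of throttled RT runqueues; the timer stops once that
 * reports idle.
 */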
00200 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
00201 {
00202 struct rt_bandwidth *rt_b =
00203 container_of(timer, struct rt_bandwidth, rt_period_timer);
00204 ktime_t now;
00205 int overrun;
00206 int idle = 0;
00207
00208 for (;;) {
00209 now = hrtimer_cb_get_time(timer);
00210 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
00211
00212 if (!overrun)
00213 break;
00214
00215 idle = do_sched_rt_period_timer(rt_b, overrun);
00216 }
00217
00218 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
00219 }
00220
00221 static
00222 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
00223 {
00224 rt_b->rt_period = ns_to_ktime(period);
00225 rt_b->rt_runtime = runtime;
00226
00227 spin_lock_init(&rt_b->rt_runtime_lock);
00228
00229 hrtimer_init(&rt_b->rt_period_timer,
00230 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
00231 rt_b->rt_period_timer.function = sched_rt_period_timer;
00232 }
00233
00234 static inline int rt_bandwidth_enabled(void)
00235 {
00236 return sysctl_sched_rt_runtime >= 0;
00237 }
00238
00239 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
00240 {
00241 ktime_t now;
00242
00243 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
00244 return;
00245
00246 if (hrtimer_active(&rt_b->rt_period_timer))
00247 return;
00248
00249 spin_lock(&rt_b->rt_runtime_lock);
00250 for (;;) {
00251 if (hrtimer_active(&rt_b->rt_period_timer))
00252 break;
00253
00254 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
00255 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
00256 hrtimer_start_expires(&rt_b->rt_period_timer,
00257 HRTIMER_MODE_ABS);
00258 }
00259 spin_unlock(&rt_b->rt_runtime_lock);
00260 }
00261
00262 #ifdef CONFIG_RT_GROUP_SCHED
00263 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
00264 {
00265 hrtimer_cancel(&rt_b->rt_period_timer);
00266 }
00267 #endif
00268
00269
00270
00271
00272
00273 static DEFINE_MUTEX(sched_domains_mutex);
00274
00275 #ifdef CONFIG_GROUP_SCHED
00276
00277 #include <linux/cgroup.h>
00278
00279 struct cfs_rq;
00280
00281 static LIST_HEAD(task_groups);
00282
00283
00284 struct task_group {
00285 #ifdef CONFIG_CGROUP_SCHED
00286 struct cgroup_subsys_state css;
00287 #endif
00288
00289 #ifdef CONFIG_USER_SCHED
00290 uid_t uid;
00291 #endif
00292
00293 #ifdef CONFIG_FAIR_GROUP_SCHED
00294
00295 struct sched_entity **se;
00296
00297 struct cfs_rq **cfs_rq;
00298 unsigned long shares;
00299 #endif
00300
00301 #ifdef CONFIG_RT_GROUP_SCHED
00302 struct sched_rt_entity **rt_se;
00303 struct rt_rq **rt_rq;
00304
00305 struct rt_bandwidth rt_bandwidth;
00306 #endif
00307
00308 struct rcu_head rcu;
00309 struct list_head list;
00310
00311 struct task_group *parent;
00312 struct list_head siblings;
00313 struct list_head children;
00314 };
00315
00316 #ifdef CONFIG_USER_SCHED
00317
00318
00319 void set_tg_uid(struct user_struct *user)
00320 {
00321 user->tg->uid = user->uid;
00322 }
00323
00324
00325
00326
00327
00328
00329 struct task_group root_task_group;
00330
00331 #ifdef CONFIG_FAIR_GROUP_SCHED
00332
00333 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
00334
00335 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
00336 #endif
00337
00338 #ifdef CONFIG_RT_GROUP_SCHED
00339 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
00340 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
00341 #endif
00342 #else
00343 #define root_task_group init_task_group
00344 #endif
00345
00346
00347
00348
00349 static DEFINE_SPINLOCK(task_group_lock);
00350
00351 #ifdef CONFIG_FAIR_GROUP_SCHED
00352 #ifdef CONFIG_USER_SCHED
00353 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
00354 #else
00355 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
00356 #endif
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366 #define MIN_SHARES 2
00367 #define MAX_SHARES (1UL << 18)
00368
00369 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
00370 #endif
00371
00372
00373
00374
00375 struct task_group init_task_group;
00376
00377
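/*
 * Return the task group @p belongs to: keyed off the task's user
 * (CONFIG_USER_SCHED), its cgroup (CONFIG_CGROUP_SCHED), or the single
 * init_task_group otherwise.
 */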
00378 static inline struct task_group *task_group(struct task_struct *p)
00379 {
00380 struct task_group *tg;
00381
00382 #ifdef CONFIG_USER_SCHED
00383 rcu_read_lock();
00384 tg = __task_cred(p)->user->tg;
00385 rcu_read_unlock();
00386 #elif defined(CONFIG_CGROUP_SCHED)
00387 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
00388 struct task_group, css);
00389 #else
00390 tg = &init_task_group;
00391 #endif
00392 return tg;
00393 }
00394
00395
00396 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
00397 {
00398 #ifdef CONFIG_FAIR_GROUP_SCHED
00399 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
00400 p->se.parent = task_group(p)->se[cpu];
00401 #endif
00402
00403 #ifdef CONFIG_RT_GROUP_SCHED
00404 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
00405 p->rt.parent = task_group(p)->rt_se[cpu];
00406 #endif
00407 }
00408
00409 #else
00410
00411 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
00412 static inline struct task_group *task_group(struct task_struct *p)
00413 {
00414 return NULL;
00415 }
00416
00417 #endif
00418
00419
00420 struct cfs_rq {
00421 struct load_weight load;
00422 unsigned long nr_running;
00423
00424 u64 exec_clock;
00425 u64 min_vruntime;
00426
00427 struct rb_root tasks_timeline;
00428 struct rb_node *rb_leftmost;
00429
00430 struct list_head tasks;
00431 struct list_head *balance_iterator;
00432
00433
00434
00435
00436
00437 struct sched_entity *curr, *next, *last;
00438
00439 unsigned int nr_spread_over;
00440
00441 #ifdef CONFIG_FAIR_GROUP_SCHED
00442 struct rq *rq;
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452 struct list_head leaf_cfs_rq_list;
00453 struct task_group *tg;
00454
00455 #ifdef CONFIG_SMP
00456
00457
00458
00459 unsigned long task_weight;
00460
00461
00462
00463
00464
00465
00466
00467 unsigned long h_load;
00468
00469
00470
00471
00472 unsigned long shares;
00473
00474
00475
00476
00477 unsigned long rq_weight;
00478 #endif
00479 #endif
00480 };
00481
00482
00483 struct rt_rq {
00484 struct rt_prio_array active;
00485 unsigned long rt_nr_running;
00486 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
00487 int highest_prio;
00488 #endif
00489 #ifdef CONFIG_SMP
00490 unsigned long rt_nr_migratory;
00491 int overloaded;
00492 #endif
00493 int rt_throttled;
00494 u64 rt_time;
00495 u64 rt_runtime;
00496
00497 spinlock_t rt_runtime_lock;
00498
00499 #ifdef CONFIG_RT_GROUP_SCHED
00500 unsigned long rt_nr_boosted;
00501
00502 struct rq *rq;
00503 struct list_head leaf_rt_rq_list;
00504 struct task_group *tg;
00505 struct sched_rt_entity *rt_se;
00506 #endif
00507 };
00508
00509 #ifdef CONFIG_SMP
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519 struct root_domain {
00520 atomic_t refcount;
00521 cpumask_var_t span;
00522 cpumask_var_t online;
00523
00524
00525
00526
00527
00528 cpumask_var_t rto_mask;
00529 atomic_t rto_count;
00530 #ifdef CONFIG_SMP
00531 struct cpupri cpupri;
00532 #endif
00533 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
00534
00535
00536
00537
00538
00539 unsigned int sched_mc_preferred_wakeup_cpu;
00540 #endif
00541 };
00542
00543
00544
00545
00546
00547 static struct root_domain def_root_domain;
00548
00549 #endif
00550
00551
00552
00553
00554
00555
00556
00557
00558 struct rq {
00559
00560 spinlock_t lock;
00561
00562
00563
00564
00565
00566 unsigned long nr_running;
00567 #define CPU_LOAD_IDX_MAX 5
00568 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
00569 unsigned char idle_at_tick;
00570 #ifdef CONFIG_NO_HZ
00571 unsigned long last_tick_seen;
00572 unsigned char in_nohz_recently;
00573 #endif
00574
00575 struct load_weight load;
00576 unsigned long nr_load_updates;
00577 u64 nr_switches;
00578
00579 struct cfs_rq cfs;
00580 struct rt_rq rt;
00581
00582 #ifdef CONFIG_FAIR_GROUP_SCHED
00583
00584 struct list_head leaf_cfs_rq_list;
00585 #endif
00586 #ifdef CONFIG_RT_GROUP_SCHED
00587 struct list_head leaf_rt_rq_list;
00588 #endif
00589
00590
00591
00592
00593
00594
00595
00596 unsigned long nr_uninterruptible;
00597
00598 struct task_struct *curr, *idle;
00599 unsigned long next_balance;
00600 struct mm_struct *prev_mm;
00601
00602 u64 clock;
00603
00604 atomic_t nr_iowait;
00605
00606 #ifdef CONFIG_SMP
00607 struct root_domain *rd;
00608 struct sched_domain *sd;
00609
00610
00611 int active_balance;
00612 int push_cpu;
00613
00614 int cpu;
00615 int online;
00616
00617 unsigned long avg_load_per_task;
00618
00619 struct task_struct *migration_thread;
00620 struct list_head migration_queue;
00621 #endif
00622
00623 #ifdef CONFIG_SCHED_HRTICK
00624 #ifdef CONFIG_SMP
00625 int hrtick_csd_pending;
00626 struct call_single_data hrtick_csd;
00627 #endif
00628 struct hrtimer hrtick_timer;
00629 #endif
00630
00631 #ifdef CONFIG_SCHEDSTATS
00632
00633 struct sched_info rq_sched_info;
00634 unsigned long long rq_cpu_time;
00635
00636
00637
00638 unsigned int yld_exp_empty;
00639 unsigned int yld_act_empty;
00640 unsigned int yld_both_empty;
00641 unsigned int yld_count;
00642
00643
00644 unsigned int sched_switch;
00645 unsigned int sched_count;
00646 unsigned int sched_goidle;
00647
00648
00649 unsigned int ttwu_count;
00650 unsigned int ttwu_local;
00651
00652
00653 unsigned int bkl_count;
00654 #endif
00655 };
00656
00657 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
00658
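/* Ask the current task's scheduling class whether @p should preempt it. */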
00659 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
00660 {
00661 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
00662 }
00663
00664 static inline int cpu_of(struct rq *rq)
00665 {
00666 #ifdef CONFIG_SMP
00667 return rq->cpu;
00668 #else
00669 return 0;
00670 #endif
00671 }
00672
00673
00674
00675
00676
00677
00678
00679
00680 #define for_each_domain(cpu, __sd) \
00681 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
00682
00683 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
00684 #define this_rq() (&__get_cpu_var(runqueues))
00685 #define task_rq(p) cpu_rq(task_cpu(p))
00686 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
00687
00688 static inline void update_rq_clock(struct rq *rq)
00689 {
00690 rq->clock = sched_clock_cpu(cpu_of(rq));
00691 }
00692
00693
00694
00695
00696 #ifdef CONFIG_SCHED_DEBUG
00697 # define const_debug __read_mostly
00698 #else
00699 # define const_debug static const
00700 #endif
00701
00702
00703
00704
00705
00706
00707
00708
00709 int runqueue_is_locked(void)
00710 {
00711 int cpu = get_cpu();
00712 struct rq *rq = cpu_rq(cpu);
00713 int ret;
00714
00715 ret = spin_is_locked(&rq->lock);
00716 put_cpu();
00717 return ret;
00718 }
00719
00720
00721
00722
00723
00724 #define SCHED_FEAT(name, enabled) \
00725 __SCHED_FEAT_##name ,
00726
00727 enum {
00728 #include "sched_features.h"
00729 };
00730
00731 #undef SCHED_FEAT
00732
00733 #define SCHED_FEAT(name, enabled) \
00734 (1UL << __SCHED_FEAT_##name) * enabled |
00735
00736 const_debug unsigned int sysctl_sched_features =
00737 #include "sched_features.h"
00738 0;
00739
00740 #undef SCHED_FEAT
00741
00742 #ifdef CONFIG_SCHED_DEBUG
00743 #define SCHED_FEAT(name, enabled) \
00744 #name ,
00745
00746 static __read_mostly char *sched_feat_names[] = {
00747 #include "sched_features.h"
00748 NULL
00749 };
00750
00751 #undef SCHED_FEAT
00752
00753 static int sched_feat_show(struct seq_file *m, void *v)
00754 {
00755 int i;
00756
00757 for (i = 0; sched_feat_names[i]; i++) {
00758 if (!(sysctl_sched_features & (1UL << i)))
00759 seq_puts(m, "NO_");
00760 seq_printf(m, "%s ", sched_feat_names[i]);
00761 }
00762 seq_puts(m, "\n");
00763
00764 return 0;
00765 }
00766
00767 static ssize_t
00768 sched_feat_write(struct file *filp, const char __user *ubuf,
00769 size_t cnt, loff_t *ppos)
00770 {
00771 char buf[64];
00772 char *cmp = buf;
00773 int neg = 0;
00774 int i;
00775
00776 if (cnt > 63)
00777 cnt = 63;
00778
00779 if (copy_from_user(&buf, ubuf, cnt))
00780 return -EFAULT;
00781
00782 buf[cnt] = 0;
00783
00784 if (strncmp(buf, "NO_", 3) == 0) {
00785 neg = 1;
00786 cmp += 3;
00787 }
00788
00789 for (i = 0; sched_feat_names[i]; i++) {
00790 int len = strlen(sched_feat_names[i]);
00791
00792 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
00793 if (neg)
00794 sysctl_sched_features &= ~(1UL << i);
00795 else
00796 sysctl_sched_features |= (1UL << i);
00797 break;
00798 }
00799 }
00800
00801 if (!sched_feat_names[i])
00802 return -EINVAL;
00803
00804 filp->f_pos += cnt;
00805
00806 return cnt;
00807 }
00808
00809 static int sched_feat_open(struct inode *inode, struct file *filp)
00810 {
00811 return single_open(filp, sched_feat_show, NULL);
00812 }
00813
00814 static struct file_operations sched_feat_fops = {
00815 .open = sched_feat_open,
00816 .write = sched_feat_write,
00817 .read = seq_read,
00818 .llseek = seq_lseek,
00819 .release = single_release,
00820 };
00821
00822 static __init int sched_init_debug(void)
00823 {
00824 debugfs_create_file("sched_features", 0644, NULL, NULL,
00825 &sched_feat_fops);
00826
00827 return 0;
00828 }
00829 late_initcall(sched_init_debug);
00830
00831 #endif
00832
00833 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
00834
00835
00836
00837
00838
00839 const_debug unsigned int sysctl_sched_nr_migrate = 32;
00840
00841
00842
00843
00844
00845 unsigned int sysctl_sched_shares_ratelimit = 250000;
00846
00847
00848
00849
00850
00851
00852 unsigned int sysctl_sched_shares_thresh = 4;
00853
00854
00855
00856
00857
00858 unsigned int sysctl_sched_rt_period = 1000000;
00859
00860 static __read_mostly int scheduler_running;
00861
00862
00863
00864
00865
00866 int sysctl_sched_rt_runtime = 950000;
00867
00868 static inline u64 global_rt_period(void)
00869 {
00870 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
00871 }
00872
00873 static inline u64 global_rt_runtime(void)
00874 {
00875 if (sysctl_sched_rt_runtime < 0)
00876 return RUNTIME_INF;
00877
00878 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
00879 }
00880
00881 #ifndef prepare_arch_switch
00882 # define prepare_arch_switch(next) do { } while (0)
00883 #endif
00884 #ifndef finish_arch_switch
00885 # define finish_arch_switch(prev) do { } while (0)
00886 #endif
00887
00888 static inline int task_current(struct rq *rq, struct task_struct *p)
00889 {
00890 return rq->curr == p;
00891 }
00892
00893 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
00894 static inline int task_running(struct rq *rq, struct task_struct *p)
00895 {
00896 return task_current(rq, p);
00897 }
00898
00899 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
00900 {
00901 }
00902
00903 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
00904 {
00905 #ifdef CONFIG_DEBUG_SPINLOCK
00906
00907 rq->lock.owner = current;
00908 #endif
00909
00910
00911
00912
00913
00914 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
00915
00916 spin_unlock_irq(&rq->lock);
00917 }
00918
00919 #else
00920 static inline int task_running(struct rq *rq, struct task_struct *p)
00921 {
00922 #ifdef CONFIG_SMP
00923 return p->oncpu;
00924 #else
00925 return task_current(rq, p);
00926 #endif
00927 }
00928
00929 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
00930 {
00931 #ifdef CONFIG_SMP
00932
00933
00934
00935
00936
00937 next->oncpu = 1;
00938 #endif
00939 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
00940 spin_unlock_irq(&rq->lock);
00941 #else
00942 spin_unlock(&rq->lock);
00943 #endif
00944 }
00945
00946 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
00947 {
00948 #ifdef CONFIG_SMP
00949
00950
00951
00952
00953
00954 smp_wmb();
00955 prev->oncpu = 0;
00956 #endif
00957 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
00958 local_irq_enable();
00959 #endif
00960 }
00961 #endif
00962
00963
00964
00965
00966
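/*
 * Lock the runqueue @p currently resides on.  The task may migrate
 * between the task_rq() lookup and taking the lock, so re-check and
 * retry until the two agree.
 */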
00967 static inline struct rq *__task_rq_lock(struct task_struct *p)
00968 __acquires(rq->lock)
00969 {
00970 for (;;) {
00971 struct rq *rq = task_rq(p);
00972 spin_lock(&rq->lock);
00973 if (likely(rq == task_rq(p)))
00974 return rq;
00975 spin_unlock(&rq->lock);
00976 }
00977 }
00978
00979
00980
00981
00982
00983
00984 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
00985 __acquires(rq->lock)
00986 {
00987 struct rq *rq;
00988
00989 for (;;) {
00990 local_irq_save(*flags);
00991 rq = task_rq(p);
00992 spin_lock(&rq->lock);
00993 if (likely(rq == task_rq(p)))
00994 return rq;
00995 spin_unlock_irqrestore(&rq->lock, *flags);
00996 }
00997 }
00998
00999 void task_rq_unlock_wait(struct task_struct *p)
01000 {
01001 struct rq *rq = task_rq(p);
01002
01003 smp_mb();
01004 spin_unlock_wait(&rq->lock);
01005 }
01006
01007 static void __task_rq_unlock(struct rq *rq)
01008 __releases(rq->lock)
01009 {
01010 spin_unlock(&rq->lock);
01011 }
01012
01013 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
01014 __releases(rq->lock)
01015 {
01016 spin_unlock_irqrestore(&rq->lock, *flags);
01017 }
01018
01019
01020
01021
01022 static struct rq *this_rq_lock(void)
01023 __acquires(rq->lock)
01024 {
01025 struct rq *rq;
01026
01027 local_irq_disable();
01028 rq = this_rq();
01029 spin_lock(&rq->lock);
01030
01031 return rq;
01032 }
01033
01034 #ifdef CONFIG_SCHED_HRTICK
01035
01036
01037
01038
01039
01040
01041
01042
01043
01044
01045
01046
01047
01048
01049
01050
01051 static inline int hrtick_enabled(struct rq *rq)
01052 {
01053 if (!sched_feat(HRTICK))
01054 return 0;
01055 if (!cpu_active(cpu_of(rq)))
01056 return 0;
01057 return hrtimer_is_hres_active(&rq->hrtick_timer);
01058 }
01059
01060 static void hrtick_clear(struct rq *rq)
01061 {
01062 if (hrtimer_active(&rq->hrtick_timer))
01063 hrtimer_cancel(&rq->hrtick_timer);
01064 }
01065
01066
01067
01068
01069
01070 static enum hrtimer_restart hrtick(struct hrtimer *timer)
01071 {
01072 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
01073
01074 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
01075
01076 spin_lock(&rq->lock);
01077 update_rq_clock(rq);
01078 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
01079 spin_unlock(&rq->lock);
01080
01081 return HRTIMER_NORESTART;
01082 }
01083
01084 #ifdef CONFIG_SMP
01085
01086
01087
01088 static void __hrtick_start(void *arg)
01089 {
01090 struct rq *rq = arg;
01091
01092 spin_lock(&rq->lock);
01093 hrtimer_restart(&rq->hrtick_timer);
01094 rq->hrtick_csd_pending = 0;
01095 spin_unlock(&rq->lock);
01096 }
01097
01098
01099
01100
01101
01102
01103 static void hrtick_start(struct rq *rq, u64 delay)
01104 {
01105 struct hrtimer *timer = &rq->hrtick_timer;
01106 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
01107
01108 hrtimer_set_expires(timer, time);
01109
01110 if (rq == this_rq()) {
01111 hrtimer_restart(timer);
01112 } else if (!rq->hrtick_csd_pending) {
01113 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
01114 rq->hrtick_csd_pending = 1;
01115 }
01116 }
01117
01118 static int
01119 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
01120 {
01121 int cpu = (int)(long)hcpu;
01122
01123 switch (action) {
01124 case CPU_UP_CANCELED:
01125 case CPU_UP_CANCELED_FROZEN:
01126 case CPU_DOWN_PREPARE:
01127 case CPU_DOWN_PREPARE_FROZEN:
01128 case CPU_DEAD:
01129 case CPU_DEAD_FROZEN:
01130 hrtick_clear(cpu_rq(cpu));
01131 return NOTIFY_OK;
01132 }
01133
01134 return NOTIFY_DONE;
01135 }
01136
01137 static __init void init_hrtick(void)
01138 {
01139 hotcpu_notifier(hotplug_hrtick, 0);
01140 }
01141 #else
01142
01143
01144
01145
01146
01147 static void hrtick_start(struct rq *rq, u64 delay)
01148 {
01149 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
01150 }
01151
01152 static inline void init_hrtick(void)
01153 {
01154 }
01155 #endif
01156
01157 static void init_rq_hrtick(struct rq *rq)
01158 {
01159 #ifdef CONFIG_SMP
01160 rq->hrtick_csd_pending = 0;
01161
01162 rq->hrtick_csd.flags = 0;
01163 rq->hrtick_csd.func = __hrtick_start;
01164 rq->hrtick_csd.info = rq;
01165 #endif
01166
01167 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
01168 rq->hrtick_timer.function = hrtick;
01169 }
01170 #else
01171 static inline void hrtick_clear(struct rq *rq)
01172 {
01173 }
01174
01175 static inline void init_rq_hrtick(struct rq *rq)
01176 {
01177 }
01178
01179 static inline void init_hrtick(void)
01180 {
01181 }
01182 #endif
01183
01184
01185
01186
01187
01188
01189
01190
01191 #ifdef CONFIG_SMP
01192
01193 #ifndef tsk_is_polling
01194 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
01195 #endif
01196
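/*
 * Mark @p as needing a reschedule.  If it is running on another CPU
 * that is not polling TIF_NEED_RESCHED, kick that CPU with a
 * reschedule IPI so the flag is noticed promptly.
 */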
01197 static void resched_task(struct task_struct *p)
01198 {
01199 int cpu;
01200
01201 assert_spin_locked(&task_rq(p)->lock);
01202
01203 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
01204 return;
01205
01206 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
01207
01208 cpu = task_cpu(p);
01209 if (cpu == smp_processor_id())
01210 return;
01211
01212
01213 smp_mb();
01214 if (!tsk_is_polling(p))
01215 smp_send_reschedule(cpu);
01216 }
01217
01218 static void resched_cpu(int cpu)
01219 {
01220 struct rq *rq = cpu_rq(cpu);
01221 unsigned long flags;
01222
01223 if (!spin_trylock_irqsave(&rq->lock, flags))
01224 return;
01225 resched_task(cpu_curr(cpu));
01226 spin_unlock_irqrestore(&rq->lock, flags);
01227 }
01228
01229 #ifdef CONFIG_NO_HZ
01230
01231
01232
01233
01234
01235
01236
01237
01238
01239
01240 void wake_up_idle_cpu(int cpu)
01241 {
01242 struct rq *rq = cpu_rq(cpu);
01243
01244 if (cpu == smp_processor_id())
01245 return;
01246
01247
01248
01249
01250
01251
01252
01253
01254 if (rq->curr != rq->idle)
01255 return;
01256
01257
01258
01259
01260
01261
01262 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
01263
01264
01265 smp_mb();
01266 if (!tsk_is_polling(rq->idle))
01267 smp_send_reschedule(cpu);
01268 }
01269 #endif
01270
01271 #else
01272 static void resched_task(struct task_struct *p)
01273 {
01274 assert_spin_locked(&task_rq(p)->lock);
01275 set_tsk_need_resched(p);
01276 }
01277 #endif
01278
01279 #if BITS_PER_LONG == 32
01280 # define WMULT_CONST (~0UL)
01281 #else
01282 # define WMULT_CONST (1UL << 32)
01283 #endif
01284
01285 #define WMULT_SHIFT 32
01286
01287
01288
01289
01290 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
01291
01292
01293
01294
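/*
 * delta_exec * weight / lw->weight, computed with a cached 32-bit
 * inverse (lw->inv_weight) and rounded shifts so the intermediate
 * product stays within 64 bits.
 */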
01295 static unsigned long
01296 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
01297 struct load_weight *lw)
01298 {
01299 u64 tmp;
01300
01301 if (!lw->inv_weight) {
01302 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
01303 lw->inv_weight = 1;
01304 else
01305 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
01306 / (lw->weight+1);
01307 }
01308
01309 tmp = (u64)delta_exec * weight;
01310
01311
01312
01313 if (unlikely(tmp > WMULT_CONST))
01314 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
01315 WMULT_SHIFT/2);
01316 else
01317 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
01318
01319 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
01320 }
01321
01322 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
01323 {
01324 lw->weight += inc;
01325 lw->inv_weight = 0;
01326 }
01327
01328 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
01329 {
01330 lw->weight -= dec;
01331 lw->inv_weight = 0;
01332 }
01333
01334
01335
01336
01337
01338
01339
01340
01341
01342
01343 #define WEIGHT_IDLEPRIO 3
01344 #define WMULT_IDLEPRIO 1431655765
01345
01346
01347
01348
01349
01350
01351
01352
01353
01354
01355
01356
01357
01358 static const int prio_to_weight[40] = {
01359 88761, 71755, 56483, 46273, 36291,
01360 29154, 23254, 18705, 14949, 11916,
01361 9548, 7620, 6100, 4904, 3906,
01362 3121, 2501, 1991, 1586, 1277,
01363 1024, 820, 655, 526, 423,
01364 335, 272, 215, 172, 137,
01365 110, 87, 70, 56, 45,
01366 36, 29, 23, 18, 15,
01367 };
01368
01369
01370
01371
01372
01373
01374
01375
01376 static const u32 prio_to_wmult[40] = {
01377 48388, 59856, 76040, 92818, 118348,
01378 147320, 184698, 229616, 287308, 360437,
01379 449829, 563644, 704093, 875809, 1099582,
01380 1376151, 1717300, 2157191, 2708050, 3363326,
01381 4194304, 5237765, 6557202, 8165337, 10153587,
01382 12820798, 15790321, 19976592, 24970740, 31350126,
01383 39045157, 49367440, 61356676, 76695844, 95443717,
01384 119304647, 148102320, 186737708, 238609294, 286331153,
01385 };
01386
01387 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
01388
01389
01390
01391
01392
01393
01394 struct rq_iterator {
01395 void *arg;
01396 struct task_struct *(*start)(void *);
01397 struct task_struct *(*next)(void *);
01398 };
01399
01400 #ifdef CONFIG_SMP
01401 static unsigned long
01402 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
01403 unsigned long max_load_move, struct sched_domain *sd,
01404 enum cpu_idle_type idle, int *all_pinned,
01405 int *this_best_prio, struct rq_iterator *iterator);
01406
01407 static int
01408 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
01409 struct sched_domain *sd, enum cpu_idle_type idle,
01410 struct rq_iterator *iterator);
01411 #endif
01412
01413 #ifdef CONFIG_CGROUP_CPUACCT
01414 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
01415 #else
01416 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
01417 #endif
01418
01419 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
01420 {
01421 update_load_add(&rq->load, load);
01422 }
01423
01424 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
01425 {
01426 update_load_sub(&rq->load, load);
01427 }
01428
01429 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
01430 typedef int (*tg_visitor)(struct task_group *, void *);
01431
01432
01433
01434
01435
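/*
 * Iterate the whole task-group tree: call @down for each group on the
 * way down and @up on the way back up, aborting if either returns
 * non-zero.  @data is passed through to both callbacks.
 */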
01436 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
01437 {
01438 struct task_group *parent, *child;
01439 int ret;
01440
01441 rcu_read_lock();
01442 parent = &root_task_group;
01443 down:
01444 ret = (*down)(parent, data);
01445 if (ret)
01446 goto out_unlock;
01447 list_for_each_entry_rcu(child, &parent->children, siblings) {
01448 parent = child;
01449 goto down;
01450
01451 up:
01452 continue;
01453 }
01454 ret = (*up)(parent, data);
01455 if (ret)
01456 goto out_unlock;
01457
01458 child = parent;
01459 parent = parent->parent;
01460 if (parent)
01461 goto up;
01462 out_unlock:
01463 rcu_read_unlock();
01464
01465 return ret;
01466 }
01467
01468 static int tg_nop(struct task_group *tg, void *data)
01469 {
01470 return 0;
01471 }
01472 #endif
01473
01474 #ifdef CONFIG_SMP
01475 static unsigned long source_load(int cpu, int type);
01476 static unsigned long target_load(int cpu, int type);
01477 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
01478
01479 static unsigned long cpu_avg_load_per_task(int cpu)
01480 {
01481 struct rq *rq = cpu_rq(cpu);
01482 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
01483
01484 if (nr_running)
01485 rq->avg_load_per_task = rq->load.weight / nr_running;
01486 else
01487 rq->avg_load_per_task = 0;
01488
01489 return rq->avg_load_per_task;
01490 }
01491
01492 #ifdef CONFIG_FAIR_GROUP_SCHED
01493
01494 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
01495
01496
01497
01498
01499 static void
01500 update_group_shares_cpu(struct task_group *tg, int cpu,
01501 unsigned long sd_shares, unsigned long sd_rq_weight)
01502 {
01503 unsigned long shares;
01504 unsigned long rq_weight;
01505
01506 if (!tg->se[cpu])
01507 return;
01508
01509 rq_weight = tg->cfs_rq[cpu]->rq_weight;
01510
01511
01512
01513
01514
01515
01516
01517 shares = (sd_shares * rq_weight) / sd_rq_weight;
01518 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
01519
01520 if (abs(shares - tg->se[cpu]->load.weight) >
01521 sysctl_sched_shares_thresh) {
01522 struct rq *rq = cpu_rq(cpu);
01523 unsigned long flags;
01524
01525 spin_lock_irqsave(&rq->lock, flags);
01526 tg->cfs_rq[cpu]->shares = shares;
01527
01528 __set_se_shares(tg->se[cpu], shares);
01529 spin_unlock_irqrestore(&rq->lock, flags);
01530 }
01531 }
01532
01533
01534
01535
01536
01537
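/*
 * Re-distribute a group's shares over the CPUs spanned by the sched
 * domain in proportion to each CPU's cfs_rq weight.
 */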
01538 static int tg_shares_up(struct task_group *tg, void *data)
01539 {
01540 unsigned long weight, rq_weight = 0;
01541 unsigned long shares = 0;
01542 struct sched_domain *sd = data;
01543 int i;
01544
01545 for_each_cpu(i, sched_domain_span(sd)) {
01546
01547
01548
01549
01550
01551 weight = tg->cfs_rq[i]->load.weight;
01552 if (!weight)
01553 weight = NICE_0_LOAD;
01554
01555 tg->cfs_rq[i]->rq_weight = weight;
01556 rq_weight += weight;
01557 shares += tg->cfs_rq[i]->shares;
01558 }
01559
01560 if ((!shares && rq_weight) || shares > tg->shares)
01561 shares = tg->shares;
01562
01563 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
01564 shares = tg->shares;
01565
01566 for_each_cpu(i, sched_domain_span(sd))
01567 update_group_shares_cpu(tg, i, shares, rq_weight);
01568
01569 return 0;
01570 }
01571
01572
01573
01574
01575
01576
01577 static int tg_load_down(struct task_group *tg, void *data)
01578 {
01579 unsigned long load;
01580 long cpu = (long)data;
01581
01582 if (!tg->parent) {
01583 load = cpu_rq(cpu)->load.weight;
01584 } else {
01585 load = tg->parent->cfs_rq[cpu]->h_load;
01586 load *= tg->cfs_rq[cpu]->shares;
01587 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
01588 }
01589
01590 tg->cfs_rq[cpu]->h_load = load;
01591
01592 return 0;
01593 }
01594
01595 static void update_shares(struct sched_domain *sd)
01596 {
01597 u64 now = cpu_clock(raw_smp_processor_id());
01598 s64 elapsed = now - sd->last_update;
01599
01600 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
01601 sd->last_update = now;
01602 walk_tg_tree(tg_nop, tg_shares_up, sd);
01603 }
01604 }
01605
01606 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
01607 {
01608 spin_unlock(&rq->lock);
01609 update_shares(sd);
01610 spin_lock(&rq->lock);
01611 }
01612
01613 static void update_h_load(long cpu)
01614 {
01615 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
01616 }
01617
01618 #else
01619
01620 static inline void update_shares(struct sched_domain *sd)
01621 {
01622 }
01623
01624 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
01625 {
01626 }
01627
01628 #endif
01629
01630
01631
01632
01633 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
01634 __releases(this_rq->lock)
01635 __acquires(busiest->lock)
01636 __acquires(this_rq->lock)
01637 {
01638 int ret = 0;
01639
01640 if (unlikely(!irqs_disabled())) {
01641
01642 spin_unlock(&this_rq->lock);
01643 BUG_ON(1);
01644 }
01645 if (unlikely(!spin_trylock(&busiest->lock))) {
01646 if (busiest < this_rq) {
01647 spin_unlock(&this_rq->lock);
01648 spin_lock(&busiest->lock);
01649 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
01650 ret = 1;
01651 } else
01652 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
01653 }
01654 return ret;
01655 }
01656
01657 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
01658 __releases(busiest->lock)
01659 {
01660 spin_unlock(&busiest->lock);
01661 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
01662 }
01663 #endif
01664
01665 #ifdef CONFIG_FAIR_GROUP_SCHED
01666 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
01667 {
01668 #ifdef CONFIG_SMP
01669 cfs_rq->shares = shares;
01670 #endif
01671 }
01672 #endif
01673
01674 #include "sched_stats.h"
01675 #include "sched_idletask.c"
01676 #include "sched_fair.c"
01677 #include "sched_rt.c"
01678 #ifdef CONFIG_SCHED_DEBUG
01679 # include "sched_debug.c"
01680 #endif
01681
01682 #define sched_class_highest (&rt_sched_class)
01683 #define for_each_class(class) \
01684 for (class = sched_class_highest; class; class = class->next)
01685
01686 static void inc_nr_running(struct rq *rq)
01687 {
01688 rq->nr_running++;
01689 }
01690
01691 static void dec_nr_running(struct rq *rq)
01692 {
01693 rq->nr_running--;
01694 }
01695
01696 static void set_load_weight(struct task_struct *p)
01697 {
01698 if (task_has_rt_policy(p)) {
01699 p->se.load.weight = prio_to_weight[0] * 2;
01700 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
01701 return;
01702 }
01703
01704
01705
01706
01707 if (p->policy == SCHED_IDLE) {
01708 p->se.load.weight = WEIGHT_IDLEPRIO;
01709 p->se.load.inv_weight = WMULT_IDLEPRIO;
01710 return;
01711 }
01712
01713 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
01714 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
01715 }
01716
01717 static void update_avg(u64 *avg, u64 sample)
01718 {
01719 s64 diff = sample - *avg;
01720 *avg += diff >> 3;
01721 }
01722
01723 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
01724 {
01725 sched_info_queued(p);
01726 p->sched_class->enqueue_task(rq, p, wakeup);
01727 p->se.on_rq = 1;
01728 }
01729
01730 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
01731 {
01732 if (sleep && p->se.last_wakeup) {
01733 update_avg(&p->se.avg_overlap,
01734 p->se.sum_exec_runtime - p->se.last_wakeup);
01735 p->se.last_wakeup = 0;
01736 }
01737
01738 sched_info_dequeued(p);
01739 p->sched_class->dequeue_task(rq, p, sleep);
01740 p->se.on_rq = 0;
01741 }
01742
01743
01744
01745
01746 static inline int __normal_prio(struct task_struct *p)
01747 {
01748 return p->static_prio;
01749 }
01750
01751
01752
01753
01754
01755
01756
01757
01758 static inline int normal_prio(struct task_struct *p)
01759 {
01760 int prio;
01761
01762 if (task_has_rt_policy(p))
01763 prio = MAX_RT_PRIO-1 - p->rt_priority;
01764 else
01765 prio = __normal_prio(p);
01766 return prio;
01767 }
01768
01769
01770
01771
01772
01773
01774
01775
01776 static int effective_prio(struct task_struct *p)
01777 {
01778 p->normal_prio = normal_prio(p);
01779
01780
01781
01782
01783
01784 if (!rt_prio(p->prio))
01785 return p->normal_prio;
01786 return p->prio;
01787 }
01788
01789
01790
01791
01792 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
01793 {
01794 if (task_contributes_to_load(p))
01795 rq->nr_uninterruptible--;
01796
01797 enqueue_task(rq, p, wakeup);
01798 inc_nr_running(rq);
01799 }
01800
01801
01802
01803
01804 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
01805 {
01806 if (task_contributes_to_load(p))
01807 rq->nr_uninterruptible++;
01808
01809 dequeue_task(rq, p, sleep);
01810 dec_nr_running(rq);
01811 }
01812
01813
01814
01815
01816
01817 inline int task_curr(const struct task_struct *p)
01818 {
01819 return cpu_curr(task_cpu(p)) == p;
01820 }
01821
01822 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
01823 {
01824 set_task_rq(p, cpu);
01825 #ifdef CONFIG_SMP
01826
01827
01828
01829
01830
01831 smp_wmb();
01832 task_thread_info(p)->cpu = cpu;
01833 #endif
01834 }
01835
01836 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
01837 const struct sched_class *prev_class,
01838 int oldprio, int running)
01839 {
01840 if (prev_class != p->sched_class) {
01841 if (prev_class->switched_from)
01842 prev_class->switched_from(rq, p, running);
01843 p->sched_class->switched_to(rq, p, running);
01844 } else
01845 p->sched_class->prio_changed(rq, p, oldprio, running);
01846 }
01847
01848 #ifdef CONFIG_SMP
01849
01850
01851 static unsigned long weighted_cpuload(const int cpu)
01852 {
01853 return cpu_rq(cpu)->load.weight;
01854 }
01855
01856
01857
01858
01859 static int
01860 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
01861 {
01862 s64 delta;
01863
01864
01865
01866
01867 if (sched_feat(CACHE_HOT_BUDDY) &&
01868 (&p->se == cfs_rq_of(&p->se)->next ||
01869 &p->se == cfs_rq_of(&p->se)->last))
01870 return 1;
01871
01872 if (p->sched_class != &fair_sched_class)
01873 return 0;
01874
01875 if (sysctl_sched_migration_cost == -1)
01876 return 1;
01877 if (sysctl_sched_migration_cost == 0)
01878 return 0;
01879
01880 delta = now - p->se.exec_start;
01881
01882 return delta < (s64)sysctl_sched_migration_cost;
01883 }
01884
01885
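/*
 * Move @p's scheduler state to @new_cpu: compensate schedstat
 * timestamps for the clock offset between the two runqueues and
 * re-base vruntime on the destination cfs_rq's min_vruntime.
 */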
01886 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
01887 {
01888 int old_cpu = task_cpu(p);
01889 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
01890 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
01891 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
01892 u64 clock_offset;
01893
01894 clock_offset = old_rq->clock - new_rq->clock;
01895
01896 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
01897
01898 #ifdef CONFIG_SCHEDSTATS
01899 if (p->se.wait_start)
01900 p->se.wait_start -= clock_offset;
01901 if (p->se.sleep_start)
01902 p->se.sleep_start -= clock_offset;
01903 if (p->se.block_start)
01904 p->se.block_start -= clock_offset;
01905 if (old_cpu != new_cpu) {
01906 schedstat_inc(p, se.nr_migrations);
01907 if (task_hot(p, old_rq->clock, NULL))
01908 schedstat_inc(p, se.nr_forced2_migrations);
01909 }
01910 #endif
01911 p->se.vruntime -= old_cfsrq->min_vruntime -
01912 new_cfsrq->min_vruntime;
01913
01914 __set_task_cpu(p, new_cpu);
01915 }
01916
01917 struct migration_req {
01918 struct list_head list;
01919
01920 struct task_struct *task;
01921 int dest_cpu;
01922
01923 struct completion done;
01924 };
01925
01926
01927
01928
01929
01930 static int
01931 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
01932 {
01933 struct rq *rq = task_rq(p);
01934
01935
01936
01937
01938
01939 if (!p->se.on_rq && !task_running(rq, p)) {
01940 set_task_cpu(p, dest_cpu);
01941 return 0;
01942 }
01943
01944 init_completion(&req->done);
01945 req->task = p;
01946 req->dest_cpu = dest_cpu;
01947 list_add(&req->list, &rq->migration_queue);
01948
01949 return 1;
01950 }
01951
01952
01953
01954
01955
01956
01957
01958
01959
01960
01961
01962
01963
01964
01965
01966
01967
01968 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
01969 {
01970 unsigned long flags;
01971 int running, on_rq;
01972 unsigned long ncsw;
01973 struct rq *rq;
01974
01975 for (;;) {
01976
01977
01978
01979
01980
01981
01982 rq = task_rq(p);
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
01995 while (task_running(rq, p)) {
01996 if (match_state && unlikely(p->state != match_state))
01997 return 0;
01998 cpu_relax();
01999 }
02000
02001
02002
02003
02004
02005
02006 rq = task_rq_lock(p, &flags);
02007 trace_sched_wait_task(rq, p);
02008 running = task_running(rq, p);
02009 on_rq = p->se.on_rq;
02010 ncsw = 0;
02011 if (!match_state || p->state == match_state)
02012 ncsw = p->nvcsw | LONG_MIN;
02013 task_rq_unlock(rq, &flags);
02014
02015
02016
02017
02018 if (unlikely(!ncsw))
02019 break;
02020
02021
02022
02023
02024
02025
02026
02027 if (unlikely(running)) {
02028 cpu_relax();
02029 continue;
02030 }
02031
02032
02033
02034
02035
02036
02037
02038
02039
02040
02041 if (unlikely(on_rq)) {
02042 schedule_timeout_uninterruptible(1);
02043 continue;
02044 }
02045
02046
02047
02048
02049
02050
02051 break;
02052 }
02053
02054 return ncsw;
02055 }
02056
02057
02058
02059
02060
02061
02062
02063
02064
02065
02066
02067
02068
02069
02070 void kick_process(struct task_struct *p)
02071 {
02072 int cpu;
02073
02074 preempt_disable();
02075 cpu = task_cpu(p);
02076 if ((cpu != smp_processor_id()) && task_curr(p))
02077 smp_send_reschedule(cpu);
02078 preempt_enable();
02079 }
02080
02081
02082
02083
02084
02085
02086
02087
02088 static unsigned long source_load(int cpu, int type)
02089 {
02090 struct rq *rq = cpu_rq(cpu);
02091 unsigned long total = weighted_cpuload(cpu);
02092
02093 if (type == 0 || !sched_feat(LB_BIAS))
02094 return total;
02095
02096 return min(rq->cpu_load[type-1], total);
02097 }
02098
02099
02100
02101
02102
02103 static unsigned long target_load(int cpu, int type)
02104 {
02105 struct rq *rq = cpu_rq(cpu);
02106 unsigned long total = weighted_cpuload(cpu);
02107
02108 if (type == 0 || !sched_feat(LB_BIAS))
02109 return total;
02110
02111 return max(rq->cpu_load[type-1], total);
02112 }
02113
02114
02115
02116
02117
02118 static struct sched_group *
02119 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
02120 {
02121 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
02122 unsigned long min_load = ULONG_MAX, this_load = 0;
02123 int load_idx = sd->forkexec_idx;
02124 int imbalance = 100 + (sd->imbalance_pct-100)/2;
02125
02126 do {
02127 unsigned long load, avg_load;
02128 int local_group;
02129 int i;
02130
02131
02132 if (!cpumask_intersects(sched_group_cpus(group),
02133 &p->cpus_allowed))
02134 continue;
02135
02136 local_group = cpumask_test_cpu(this_cpu,
02137 sched_group_cpus(group));
02138
02139
02140 avg_load = 0;
02141
02142 for_each_cpu(i, sched_group_cpus(group)) {
02143
02144 if (local_group)
02145 load = source_load(i, load_idx);
02146 else
02147 load = target_load(i, load_idx);
02148
02149 avg_load += load;
02150 }
02151
02152
02153 avg_load = sg_div_cpu_power(group,
02154 avg_load * SCHED_LOAD_SCALE);
02155
02156 if (local_group) {
02157 this_load = avg_load;
02158 this = group;
02159 } else if (avg_load < min_load) {
02160 min_load = avg_load;
02161 idlest = group;
02162 }
02163 } while (group = group->next, group != sd->groups);
02164
02165 if (!idlest || 100*this_load < imbalance*min_load)
02166 return NULL;
02167 return idlest;
02168 }
02169
02170
02171
02172
02173 static int
02174 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
02175 {
02176 unsigned long load, min_load = ULONG_MAX;
02177 int idlest = -1;
02178 int i;
02179
02180
02181 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
02182 load = weighted_cpuload(i);
02183
02184 if (load < min_load || (load == min_load && i == this_cpu)) {
02185 min_load = load;
02186 idlest = i;
02187 }
02188 }
02189
02190 return idlest;
02191 }
02192
02193
02194
02195
02196
02197
02198
02199
02200
02201
02202
02203
02204 static int sched_balance_self(int cpu, int flag)
02205 {
02206 struct task_struct *t = current;
02207 struct sched_domain *tmp, *sd = NULL;
02208
02209 for_each_domain(cpu, tmp) {
02210
02211
02212
02213 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
02214 break;
02215 if (tmp->flags & flag)
02216 sd = tmp;
02217 }
02218
02219 if (sd)
02220 update_shares(sd);
02221
02222 while (sd) {
02223 struct sched_group *group;
02224 int new_cpu, weight;
02225
02226 if (!(sd->flags & flag)) {
02227 sd = sd->child;
02228 continue;
02229 }
02230
02231 group = find_idlest_group(sd, t, cpu);
02232 if (!group) {
02233 sd = sd->child;
02234 continue;
02235 }
02236
02237 new_cpu = find_idlest_cpu(group, t, cpu);
02238 if (new_cpu == -1 || new_cpu == cpu) {
02239
02240 sd = sd->child;
02241 continue;
02242 }
02243
02244
02245 cpu = new_cpu;
02246 weight = cpumask_weight(sched_domain_span(sd));
02247 sd = NULL;
02248 for_each_domain(cpu, tmp) {
02249 if (weight <= cpumask_weight(sched_domain_span(tmp)))
02250 break;
02251 if (tmp->flags & flag)
02252 sd = tmp;
02253 }
02254
02255 }
02256
02257 return cpu;
02258 }
02259
02260 #endif
02261
02262
02263
02264
02265
02266
02267
02268
02269
02270
02271
02272
02273
02274
02275
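/*
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * If @p's state matches @state, put it on a runqueue (possibly
 * selecting a new CPU first) and mark it TASK_RUNNING.  Returns
 * non-zero only if the task was actually woken up here.
 */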
02276 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
02277 {
02278 int cpu, orig_cpu, this_cpu, success = 0;
02279 unsigned long flags;
02280 long old_state;
02281 struct rq *rq;
02282
02283 if (!sched_feat(SYNC_WAKEUPS))
02284 sync = 0;
02285
02286 #ifdef CONFIG_SMP
02287 if (sched_feat(LB_WAKEUP_UPDATE)) {
02288 struct sched_domain *sd;
02289
02290 this_cpu = raw_smp_processor_id();
02291 cpu = task_cpu(p);
02292
02293 for_each_domain(this_cpu, sd) {
02294 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
02295 update_shares(sd);
02296 break;
02297 }
02298 }
02299 }
02300 #endif
02301
02302 smp_wmb();
02303 rq = task_rq_lock(p, &flags);
02304 update_rq_clock(rq);
02305 old_state = p->state;
02306 if (!(old_state & state))
02307 goto out;
02308
02309 if (p->se.on_rq)
02310 goto out_running;
02311
02312 cpu = task_cpu(p);
02313 orig_cpu = cpu;
02314 this_cpu = smp_processor_id();
02315
02316 #ifdef CONFIG_SMP
02317 if (unlikely(task_running(rq, p)))
02318 goto out_activate;
02319
02320 cpu = p->sched_class->select_task_rq(p, sync);
02321 if (cpu != orig_cpu) {
02322 set_task_cpu(p, cpu);
02323 task_rq_unlock(rq, &flags);
02324
02325 rq = task_rq_lock(p, &flags);
02326 old_state = p->state;
02327 if (!(old_state & state))
02328 goto out;
02329 if (p->se.on_rq)
02330 goto out_running;
02331
02332 this_cpu = smp_processor_id();
02333 cpu = task_cpu(p);
02334 }
02335
02336 #ifdef CONFIG_SCHEDSTATS
02337 schedstat_inc(rq, ttwu_count);
02338 if (cpu == this_cpu)
02339 schedstat_inc(rq, ttwu_local);
02340 else {
02341 struct sched_domain *sd;
02342 for_each_domain(this_cpu, sd) {
02343 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
02344 schedstat_inc(sd, ttwu_wake_remote);
02345 break;
02346 }
02347 }
02348 }
02349 #endif
02350
02351 out_activate:
02352 #endif
02353 schedstat_inc(p, se.nr_wakeups);
02354 if (sync)
02355 schedstat_inc(p, se.nr_wakeups_sync);
02356 if (orig_cpu != cpu)
02357 schedstat_inc(p, se.nr_wakeups_migrate);
02358 if (cpu == this_cpu)
02359 schedstat_inc(p, se.nr_wakeups_local);
02360 else
02361 schedstat_inc(p, se.nr_wakeups_remote);
02362 activate_task(rq, p, 1);
02363 success = 1;
02364
02365 out_running:
02366 trace_sched_wakeup(rq, p, success);
02367 check_preempt_curr(rq, p, sync);
02368
02369 p->state = TASK_RUNNING;
02370 #ifdef CONFIG_SMP
02371 if (p->sched_class->task_wake_up)
02372 p->sched_class->task_wake_up(rq, p);
02373 #endif
02374 out:
02375 current->se.last_wakeup = current->se.sum_exec_runtime;
02376
02377 task_rq_unlock(rq, &flags);
02378
02379 return success;
02380 }
02381 #endif
02382
02383 int wake_up_process(struct task_struct *p)
02384 {
02385 return try_to_wake_up(p, TASK_ALL, 0);
02386 }
02387 EXPORT_SYMBOL(wake_up_process);
02388
02389 int wake_up_state(struct task_struct *p, unsigned int state)
02390 {
02391 return try_to_wake_up(p, state, 0);
02392 }
02393
02394 #ifndef DDE_LINUX
02395
02396
02397
02398
02399
02400
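/*
 * Per-scheduler setup for a freshly forked task: zero the accounting
 * fields.  The task is marked TASK_RUNNING here even though it has not
 * been enqueued or woken up yet.
 */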
02401 static void __sched_fork(struct task_struct *p)
02402 {
02403 p->se.exec_start = 0;
02404 p->se.sum_exec_runtime = 0;
02405 p->se.prev_sum_exec_runtime = 0;
02406 p->se.last_wakeup = 0;
02407 p->se.avg_overlap = 0;
02408
02409 #ifdef CONFIG_SCHEDSTATS
02410 p->se.wait_start = 0;
02411 p->se.sum_sleep_runtime = 0;
02412 p->se.sleep_start = 0;
02413 p->se.block_start = 0;
02414 p->se.sleep_max = 0;
02415 p->se.block_max = 0;
02416 p->se.exec_max = 0;
02417 p->se.slice_max = 0;
02418 p->se.wait_max = 0;
02419 #endif
02420
02421 INIT_LIST_HEAD(&p->rt.run_list);
02422 p->se.on_rq = 0;
02423 INIT_LIST_HEAD(&p->se.group_node);
02424
02425 #ifdef CONFIG_PREEMPT_NOTIFIERS
02426 INIT_HLIST_HEAD(&p->preempt_notifiers);
02427 #endif
02428
02429
02430
02431
02432
02433
02434
02435 p->state = TASK_RUNNING;
02436 }
02437
02438
02439
02440
02441 void sched_fork(struct task_struct *p, int clone_flags)
02442 {
02443 int cpu = get_cpu();
02444
02445 __sched_fork(p);
02446
02447 #ifdef CONFIG_SMP
02448 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
02449 #endif
02450 set_task_cpu(p, cpu);
02451
02452
02453
02454
02455 p->prio = current->normal_prio;
02456 if (!rt_prio(p->prio))
02457 p->sched_class = &fair_sched_class;
02458
02459 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
02460 if (likely(sched_info_on()))
02461 memset(&p->sched_info, 0, sizeof(p->sched_info));
02462 #endif
02463 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
02464 p->oncpu = 0;
02465 #endif
02466 #ifdef CONFIG_PREEMPT
02467
02468 task_thread_info(p)->preempt_count = 1;
02469 #endif
02470 put_cpu();
02471 }
02472
02473
02474
02475
02476
02477
02478
02479
02480 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
02481 {
02482 unsigned long flags;
02483 struct rq *rq;
02484
02485 rq = task_rq_lock(p, &flags);
02486 BUG_ON(p->state != TASK_RUNNING);
02487 update_rq_clock(rq);
02488
02489 p->prio = effective_prio(p);
02490
02491 if (!p->sched_class->task_new || !current->se.on_rq) {
02492 activate_task(rq, p, 0);
02493 } else {
02494
02495
02496
02497
02498 p->sched_class->task_new(rq, p);
02499 inc_nr_running(rq);
02500 }
02501 trace_sched_wakeup_new(rq, p, 1);
02502 check_preempt_curr(rq, p, 0);
02503 #ifdef CONFIG_SMP
02504 if (p->sched_class->task_wake_up)
02505 p->sched_class->task_wake_up(rq, p);
02506 #endif
02507 task_rq_unlock(rq, &flags);
02508 }
02509
02510 #ifdef CONFIG_PREEMPT_NOTIFIERS
02511
02512
02513
02514
02515
02516 void preempt_notifier_register(struct preempt_notifier *notifier)
02517 {
02518 hlist_add_head(&notifier->link, &current->preempt_notifiers);
02519 }
02520 EXPORT_SYMBOL_GPL(preempt_notifier_register);
02521
02522
02523
02524
02525
02526
02527
02528 void preempt_notifier_unregister(struct preempt_notifier *notifier)
02529 {
02530 hlist_del(&notifier->link);
02531 }
02532 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
02533
02534 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
02535 {
02536 struct preempt_notifier *notifier;
02537 struct hlist_node *node;
02538
02539 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
02540 notifier->ops->sched_in(notifier, raw_smp_processor_id());
02541 }
02542
02543 static void
02544 fire_sched_out_preempt_notifiers(struct task_struct *curr,
02545 struct task_struct *next)
02546 {
02547 struct preempt_notifier *notifier;
02548 struct hlist_node *node;
02549
02550 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
02551 notifier->ops->sched_out(notifier, next);
02552 }
02553
02554 #else
02555
02556 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
02557 {
02558 }
02559
02560 static void
02561 fire_sched_out_preempt_notifiers(struct task_struct *curr,
02562 struct task_struct *next)
02563 {
02564 }
02565
02566 #endif
02567
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
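/*
 * prepare_task_switch - called with the runqueue lock held just before
 * switching tasks; paired with finish_task_switch(), which runs after
 * the switch on the new task's stack.
 */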
02581 static inline void
02582 prepare_task_switch(struct rq *rq, struct task_struct *prev,
02583 struct task_struct *next)
02584 {
02585 fire_sched_out_preempt_notifiers(prev, next);
02586 prepare_lock_switch(rq, next);
02587 prepare_arch_switch(next);
02588 }
02589
02590
02591
02592
02593
02594
02595
02596
02597
02598
02599
02600
02601
02602
02603
02604
02605 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
02606 __releases(rq->lock)
02607 {
02608 struct mm_struct *mm = rq->prev_mm;
02609 long prev_state;
02610
02611 rq->prev_mm = NULL;
02612
02613
02614
02615
02616
02617
02618
02619
02620
02621
02622
02623
02624 prev_state = prev->state;
02625 finish_arch_switch(prev);
02626 finish_lock_switch(rq, prev);
02627 #ifdef CONFIG_SMP
02628 if (current->sched_class->post_schedule)
02629 current->sched_class->post_schedule(rq);
02630 #endif
02631
02632 fire_sched_in_preempt_notifiers(current);
02633 if (mm)
02634 mmdrop(mm);
02635 if (unlikely(prev_state == TASK_DEAD)) {
02636
02637
02638
02639
02640 kprobe_flush_task(prev);
02641 put_task_struct(prev);
02642 }
02643 }
02644
02645
02646
02647
02648
02649 asmlinkage void schedule_tail(struct task_struct *prev)
02650 __releases(rq->lock)
02651 {
02652 struct rq *rq = this_rq();
02653
02654 finish_task_switch(rq, prev);
02655 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
02656
02657 preempt_enable();
02658 #endif
02659 if (current->set_child_tid)
02660 put_user(task_pid_vnr(current), current->set_child_tid);
02661 }
02662
02663
02664
02665
02666
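/*
 * context_switch - switch to the new mm and the new thread's register
 * state.  Kernel threads have no mm of their own and borrow the
 * previous task's active_mm instead of switching the address space.
 */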
02667 static inline void
02668 context_switch(struct rq *rq, struct task_struct *prev,
02669 struct task_struct *next)
02670 {
02671 struct mm_struct *mm, *oldmm;
02672
02673 prepare_task_switch(rq, prev, next);
02674 trace_sched_switch(rq, prev, next);
02675 mm = next->mm;
02676 oldmm = prev->active_mm;
02677
02678
02679
02680
02681
02682 arch_enter_lazy_cpu_mode();
02683
02684 if (unlikely(!mm)) {
02685 next->active_mm = oldmm;
02686 atomic_inc(&oldmm->mm_count);
02687 enter_lazy_tlb(oldmm, next);
02688 } else
02689 switch_mm(oldmm, mm, next);
02690
02691 if (unlikely(!prev->mm)) {
02692 prev->active_mm = NULL;
02693 rq->prev_mm = oldmm;
02694 }
02695
02696
02697
02698
02699
02700
02701 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
02702 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
02703 #endif
02704
02705
02706 switch_to(prev, next, prev);
02707
02708 barrier();
02709
02710
02711
02712
02713
02714 finish_task_switch(this_rq(), prev);
02715 }
02716
02717
02718
02719
02720
02721
02722
02723
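/*
 * nr_running, nr_uninterruptible, nr_context_switches, nr_iowait:
 *
 * Externally visible scheduler statistics: current number of runnable
 * threads, number of uninterruptible-sleeping threads, total number of
 * context switches since boot, and number of tasks waiting on I/O.
 */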
02724 unsigned long nr_running(void)
02725 {
02726 unsigned long i, sum = 0;
02727
02728 for_each_online_cpu(i)
02729 sum += cpu_rq(i)->nr_running;
02730
02731 return sum;
02732 }
02733
02734 unsigned long nr_uninterruptible(void)
02735 {
02736 unsigned long i, sum = 0;
02737
02738 for_each_possible_cpu(i)
02739 sum += cpu_rq(i)->nr_uninterruptible;
02740
02741
02742
02743
02744
02745 if (unlikely((long)sum < 0))
02746 sum = 0;
02747
02748 return sum;
02749 }
02750
02751 unsigned long long nr_context_switches(void)
02752 {
02753 int i;
02754 unsigned long long sum = 0;
02755
02756 for_each_possible_cpu(i)
02757 sum += cpu_rq(i)->nr_switches;
02758
02759 return sum;
02760 }
02761
02762 unsigned long nr_iowait(void)
02763 {
02764 unsigned long i, sum = 0;
02765
02766 for_each_possible_cpu(i)
02767 sum += atomic_read(&cpu_rq(i)->nr_iowait);
02768
02769 return sum;
02770 }
02771
02772 unsigned long nr_active(void)
02773 {
02774 unsigned long i, running = 0, uninterruptible = 0;
02775
02776 for_each_online_cpu(i) {
02777 running += cpu_rq(i)->nr_running;
02778 uninterruptible += cpu_rq(i)->nr_uninterruptible;
02779 }
02780
02781 if (unlikely((long)uninterruptible < 0))
02782 uninterruptible = 0;
02783
02784 return running + uninterruptible;
02785 }
02786
02787
02788
02789
02790
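/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). Each index keeps an exponentially decaying
 * average of the runqueue load, with a progressively longer time constant
 * for the higher indices.
 */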
02791 static void update_cpu_load(struct rq *this_rq)
02792 {
02793 unsigned long this_load = this_rq->load.weight;
02794 int i, scale;
02795
02796 this_rq->nr_load_updates++;
02797
02798
02799 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
02800 unsigned long old_load, new_load;
02801
02802
02803
02804 old_load = this_rq->cpu_load[i];
02805 new_load = this_load;
02806
02807
02808
02809
02810
02811 if (new_load > old_load)
02812 new_load += scale-1;
02813 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
02814 }
02815 }
02816
02817 #ifdef CONFIG_SMP
02818
02819
02820
02821
02822
02823
02824
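/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock;
 * the caller must do that. The locks are taken in address order
 * to avoid deadlock, and both rq clocks are updated.
 */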
02825 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
02826 __acquires(rq1->lock)
02827 __acquires(rq2->lock)
02828 {
02829 BUG_ON(!irqs_disabled());
02830 if (rq1 == rq2) {
02831 spin_lock(&rq1->lock);
02832 __acquire(rq2->lock);
02833 } else {
02834 if (rq1 < rq2) {
02835 spin_lock(&rq1->lock);
02836 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
02837 } else {
02838 spin_lock(&rq2->lock);
02839 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
02840 }
02841 }
02842 update_rq_clock(rq1);
02843 update_rq_clock(rq2);
02844 }
02845
02846
02847
02848
02849
02850
02851
02852 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
02853 __releases(rq1->lock)
02854 __releases(rq2->lock)
02855 {
02856 spin_unlock(&rq1->lock);
02857 if (rq1 != rq2)
02858 spin_unlock(&rq2->lock);
02859 else
02860 __release(rq2->lock);
02861 }
02862
02863
02864
02865
02866
02867
02868
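/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is done by queueing a migration request on the source runqueue
 * and waking its migration thread, then waiting for the request to
 * complete.
 */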
02869 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
02870 {
02871 struct migration_req req;
02872 unsigned long flags;
02873 struct rq *rq;
02874
02875 rq = task_rq_lock(p, &flags);
02876 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
02877 || unlikely(!cpu_active(dest_cpu)))
02878 goto out;
02879
02880
02881 if (migrate_task(p, dest_cpu, &req)) {
02882
02883 struct task_struct *mt = rq->migration_thread;
02884
02885 get_task_struct(mt);
02886 task_rq_unlock(rq, &flags);
02887 wake_up_process(mt);
02888 put_task_struct(mt);
02889 wait_for_completion(&req.done);
02890
02891 return;
02892 }
02893 out:
02894 task_rq_unlock(rq, &flags);
02895 }
02896
02897
02898
02899
02900
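/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */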
02901 void sched_exec(void)
02902 {
02903 int new_cpu, this_cpu = get_cpu();
02904 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
02905 put_cpu();
02906 if (new_cpu != this_cpu)
02907 sched_migrate_task(current, new_cpu);
02908 }
02909
02910
02911
02912
02913
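/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */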
02914 static void pull_task(struct rq *src_rq, struct task_struct *p,
02915 struct rq *this_rq, int this_cpu)
02916 {
02917 deactivate_task(src_rq, p, 0);
02918 set_task_cpu(p, this_cpu);
02919 activate_task(this_rq, p, 0);
02920
02921
02922
02923
02924 check_preempt_curr(this_rq, p, 0);
02925 }
02926
02927
02928
02929
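/*
 * can_migrate_task - may task @p from runqueue @rq be migrated to @this_cpu?
 * A task may not be moved if it is not allowed to run on @this_cpu, if it
 * is currently running, or if it is cache-hot (unless balancing has already
 * failed repeatedly in this domain).
 */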
02930 static
02931 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
02932 struct sched_domain *sd, enum cpu_idle_type idle,
02933 int *all_pinned)
02934 {
02935
02936
02937
02938
02939
02940
02941 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
02942 schedstat_inc(p, se.nr_failed_migrations_affine);
02943 return 0;
02944 }
02945 *all_pinned = 0;
02946
02947 if (task_running(rq, p)) {
02948 schedstat_inc(p, se.nr_failed_migrations_running);
02949 return 0;
02950 }
02951
02952
02953
02954
02955
02956
02957
02958 if (!task_hot(p, rq->clock, sd) ||
02959 sd->nr_balance_failed > sd->cache_nice_tries) {
02960 #ifdef CONFIG_SCHEDSTATS
02961 if (task_hot(p, rq->clock, sd)) {
02962 schedstat_inc(sd, lb_hot_gained[idle]);
02963 schedstat_inc(p, se.nr_forced_migrations);
02964 }
02965 #endif
02966 return 1;
02967 }
02968
02969 if (task_hot(p, rq->clock, sd)) {
02970 schedstat_inc(p, se.nr_failed_migrations_hot);
02971 return 0;
02972 }
02973 return 1;
02974 }
02975
02976 static unsigned long
02977 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
02978 unsigned long max_load_move, struct sched_domain *sd,
02979 enum cpu_idle_type idle, int *all_pinned,
02980 int *this_best_prio, struct rq_iterator *iterator)
02981 {
02982 int loops = 0, pulled = 0, pinned = 0;
02983 struct task_struct *p;
02984 long rem_load_move = max_load_move;
02985
02986 if (max_load_move == 0)
02987 goto out;
02988
02989 pinned = 1;
02990
02991
02992
02993
02994 p = iterator->start(iterator->arg);
02995 next:
02996 if (!p || loops++ > sysctl_sched_nr_migrate)
02997 goto out;
02998
02999 if ((p->se.load.weight >> 1) > rem_load_move ||
03000 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
03001 p = iterator->next(iterator->arg);
03002 goto next;
03003 }
03004
03005 pull_task(busiest, p, this_rq, this_cpu);
03006 pulled++;
03007 rem_load_move -= p->se.load.weight;
03008
03009
03010
03011
03012 if (rem_load_move > 0) {
03013 if (p->prio < *this_best_prio)
03014 *this_best_prio = p->prio;
03015 p = iterator->next(iterator->arg);
03016 goto next;
03017 }
03018 out:
03019
03020
03021
03022
03023
03024 schedstat_add(sd, lb_gained[idle], pulled);
03025
03026 if (all_pinned)
03027 *all_pinned = pinned;
03028
03029 return max_load_move - rem_load_move;
03030 }
03031
03032
03033
03034
03035
03036
03037
03038
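/*
 * move_tasks tries to move up to max_load_move weighted load from busiest
 * to this_rq, as part of a balancing operation within domain "sd".
 *
 * Called with both runqueues locked. Returns 1 if any load was moved,
 * 0 otherwise.
 */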
03039 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
03040 unsigned long max_load_move,
03041 struct sched_domain *sd, enum cpu_idle_type idle,
03042 int *all_pinned)
03043 {
03044 const struct sched_class *class = sched_class_highest;
03045 unsigned long total_load_moved = 0;
03046 int this_best_prio = this_rq->curr->prio;
03047
03048 do {
03049 total_load_moved +=
03050 class->load_balance(this_rq, this_cpu, busiest,
03051 max_load_move - total_load_moved,
03052 sd, idle, all_pinned, &this_best_prio);
03053 class = class->next;
03054
03055 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
03056 break;
03057
03058 } while (class && max_load_move > total_load_moved);
03059
03060 return total_load_moved > 0;
03061 }
03062
03063 static int
03064 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
03065 struct sched_domain *sd, enum cpu_idle_type idle,
03066 struct rq_iterator *iterator)
03067 {
03068 struct task_struct *p = iterator->start(iterator->arg);
03069 int pinned = 0;
03070
03071 while (p) {
03072 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
03073 pull_task(busiest, p, this_rq, this_cpu);
03074
03075
03076
03077
03078
03079 schedstat_inc(sd, lb_gained[idle]);
03080
03081 return 1;
03082 }
03083 p = iterator->next(iterator->arg);
03084 }
03085
03086 return 0;
03087 }
03088
03089
03090
03091
03092
03093
03094
03095
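/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "sd".
 *
 * Called with both runqueues locked. Returns 1 if successful and 0 otherwise.
 */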
03096 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
03097 struct sched_domain *sd, enum cpu_idle_type idle)
03098 {
03099 const struct sched_class *class;
03100
03101 for (class = sched_class_highest; class; class = class->next)
03102 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
03103 return 1;
03104
03105 return 0;
03106 }
03107
03108
03109
03110
03111
03112
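/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It also calculates the amount of weighted load which should be
 * moved to restore balance, via *imbalance.
 */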
03113 static struct sched_group *
03114 find_busiest_group(struct sched_domain *sd, int this_cpu,
03115 unsigned long *imbalance, enum cpu_idle_type idle,
03116 int *sd_idle, const struct cpumask *cpus, int *balance)
03117 {
03118 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
03119 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
03120 unsigned long max_pull;
03121 unsigned long busiest_load_per_task, busiest_nr_running;
03122 unsigned long this_load_per_task, this_nr_running;
03123 int load_idx, group_imb = 0;
03124 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
03125 int power_savings_balance = 1;
03126 unsigned long leader_nr_running = 0, min_load_per_task = 0;
03127 unsigned long min_nr_running = ULONG_MAX;
03128 struct sched_group *group_min = NULL, *group_leader = NULL;
03129 #endif
03130
03131 max_load = this_load = total_load = total_pwr = 0;
03132 busiest_load_per_task = busiest_nr_running = 0;
03133 this_load_per_task = this_nr_running = 0;
03134
03135 if (idle == CPU_NOT_IDLE)
03136 load_idx = sd->busy_idx;
03137 else if (idle == CPU_NEWLY_IDLE)
03138 load_idx = sd->newidle_idx;
03139 else
03140 load_idx = sd->idle_idx;
03141
03142 do {
03143 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
03144 int local_group;
03145 int i;
03146 int __group_imb = 0;
03147 unsigned int balance_cpu = -1, first_idle_cpu = 0;
03148 unsigned long sum_nr_running, sum_weighted_load;
03149 unsigned long sum_avg_load_per_task;
03150 unsigned long avg_load_per_task;
03151
03152 local_group = cpumask_test_cpu(this_cpu,
03153 sched_group_cpus(group));
03154
03155 if (local_group)
03156 balance_cpu = cpumask_first(sched_group_cpus(group));
03157
03158
03159 sum_weighted_load = sum_nr_running = avg_load = 0;
03160 sum_avg_load_per_task = avg_load_per_task = 0;
03161
03162 max_cpu_load = 0;
03163 min_cpu_load = ~0UL;
03164
03165 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
03166 struct rq *rq = cpu_rq(i);
03167
03168 if (*sd_idle && rq->nr_running)
03169 *sd_idle = 0;
03170
03171
03172 if (local_group) {
03173 if (idle_cpu(i) && !first_idle_cpu) {
03174 first_idle_cpu = 1;
03175 balance_cpu = i;
03176 }
03177
03178 load = target_load(i, load_idx);
03179 } else {
03180 load = source_load(i, load_idx);
03181 if (load > max_cpu_load)
03182 max_cpu_load = load;
03183 if (min_cpu_load > load)
03184 min_cpu_load = load;
03185 }
03186
03187 avg_load += load;
03188 sum_nr_running += rq->nr_running;
03189 sum_weighted_load += weighted_cpuload(i);
03190
03191 sum_avg_load_per_task += cpu_avg_load_per_task(i);
03192 }
03193
03194
03195
03196
03197
03198
03199
03200 if (idle != CPU_NEWLY_IDLE && local_group &&
03201 balance_cpu != this_cpu && balance) {
03202 *balance = 0;
03203 goto ret;
03204 }
03205
03206 total_load += avg_load;
03207 total_pwr += group->__cpu_power;
03208
03209
03210 avg_load = sg_div_cpu_power(group,
03211 avg_load * SCHED_LOAD_SCALE);
03212
03213
03214
03215
03216
03217
03218
03219
03220
03221
03222
03223 avg_load_per_task = sg_div_cpu_power(group,
03224 sum_avg_load_per_task * SCHED_LOAD_SCALE);
03225
03226 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
03227 __group_imb = 1;
03228
03229 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
03230
03231 if (local_group) {
03232 this_load = avg_load;
03233 this = group;
03234 this_nr_running = sum_nr_running;
03235 this_load_per_task = sum_weighted_load;
03236 } else if (avg_load > max_load &&
03237 (sum_nr_running > group_capacity || __group_imb)) {
03238 max_load = avg_load;
03239 busiest = group;
03240 busiest_nr_running = sum_nr_running;
03241 busiest_load_per_task = sum_weighted_load;
03242 group_imb = __group_imb;
03243 }
03244
03245 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
03246
03247
03248
03249
03250 if (idle == CPU_NOT_IDLE ||
03251 !(sd->flags & SD_POWERSAVINGS_BALANCE))
03252 goto group_next;
03253
03254
03255
03256
03257
03258 if (local_group && (this_nr_running >= group_capacity ||
03259 !this_nr_running))
03260 power_savings_balance = 0;
03261
03262
03263
03264
03265
03266 if (!power_savings_balance || sum_nr_running >= group_capacity
03267 || !sum_nr_running)
03268 goto group_next;
03269
03270
03271
03272
03273
03274
03275 if ((sum_nr_running < min_nr_running) ||
03276 (sum_nr_running == min_nr_running &&
03277 cpumask_first(sched_group_cpus(group)) >
03278 cpumask_first(sched_group_cpus(group_min)))) {
03279 group_min = group;
03280 min_nr_running = sum_nr_running;
03281 min_load_per_task = sum_weighted_load /
03282 sum_nr_running;
03283 }
03284
03285
03286
03287
03288
03289
03290 if (sum_nr_running <= group_capacity - 1) {
03291 if (sum_nr_running > leader_nr_running ||
03292 (sum_nr_running == leader_nr_running &&
03293 cpumask_first(sched_group_cpus(group)) <
03294 cpumask_first(sched_group_cpus(group_leader)))) {
03295 group_leader = group;
03296 leader_nr_running = sum_nr_running;
03297 }
03298 }
03299 group_next:
03300 #endif
03301 group = group->next;
03302 } while (group != sd->groups);
03303
03304 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
03305 goto out_balanced;
03306
03307 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
03308
03309 if (this_load >= avg_load ||
03310 100*max_load <= sd->imbalance_pct*this_load)
03311 goto out_balanced;
03312
03313 busiest_load_per_task /= busiest_nr_running;
03314 if (group_imb)
03315 busiest_load_per_task = min(busiest_load_per_task, avg_load);
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328 if (max_load <= busiest_load_per_task)
03329 goto out_balanced;
03330
03331
03332
03333
03334
03335
03336 if (max_load < avg_load) {
03337 *imbalance = 0;
03338 goto small_imbalance;
03339 }
03340
03341
03342 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
03343
03344
03345 *imbalance = min(max_pull * busiest->__cpu_power,
03346 (avg_load - this_load) * this->__cpu_power)
03347 / SCHED_LOAD_SCALE;
03348
03349
03350
03351
03352
03353
03354
03355 if (*imbalance < busiest_load_per_task) {
03356 unsigned long tmp, pwr_now, pwr_move;
03357 unsigned int imbn;
03358
03359 small_imbalance:
03360 pwr_move = pwr_now = 0;
03361 imbn = 2;
03362 if (this_nr_running) {
03363 this_load_per_task /= this_nr_running;
03364 if (busiest_load_per_task > this_load_per_task)
03365 imbn = 1;
03366 } else
03367 this_load_per_task = cpu_avg_load_per_task(this_cpu);
03368
03369 if (max_load - this_load + busiest_load_per_task >=
03370 busiest_load_per_task * imbn) {
03371 *imbalance = busiest_load_per_task;
03372 return busiest;
03373 }
03374
03375
03376
03377
03378
03379
03380
03381 pwr_now += busiest->__cpu_power *
03382 min(busiest_load_per_task, max_load);
03383 pwr_now += this->__cpu_power *
03384 min(this_load_per_task, this_load);
03385 pwr_now /= SCHED_LOAD_SCALE;
03386
03387
03388 tmp = sg_div_cpu_power(busiest,
03389 busiest_load_per_task * SCHED_LOAD_SCALE);
03390 if (max_load > tmp)
03391 pwr_move += busiest->__cpu_power *
03392 min(busiest_load_per_task, max_load - tmp);
03393
03394
03395 if (max_load * busiest->__cpu_power <
03396 busiest_load_per_task * SCHED_LOAD_SCALE)
03397 tmp = sg_div_cpu_power(this,
03398 max_load * busiest->__cpu_power);
03399 else
03400 tmp = sg_div_cpu_power(this,
03401 busiest_load_per_task * SCHED_LOAD_SCALE);
03402 pwr_move += this->__cpu_power *
03403 min(this_load_per_task, this_load + tmp);
03404 pwr_move /= SCHED_LOAD_SCALE;
03405
03406
03407 if (pwr_move > pwr_now)
03408 *imbalance = busiest_load_per_task;
03409 }
03410
03411 return busiest;
03412
03413 out_balanced:
03414 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
03415 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
03416 goto ret;
03417
03418 if (this == group_leader && group_leader != group_min) {
03419 *imbalance = min_load_per_task;
03420 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
03421 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
03422 cpumask_first(sched_group_cpus(group_leader));
03423 }
03424 return group_min;
03425 }
03426 #endif
03427 ret:
03428 *imbalance = 0;
03429 return NULL;
03430 }
03431
03432
03433
03434
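/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in group.
 */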
03435 static struct rq *
03436 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
03437 unsigned long imbalance, const struct cpumask *cpus)
03438 {
03439 struct rq *busiest = NULL, *rq;
03440 unsigned long max_load = 0;
03441 int i;
03442
03443 for_each_cpu(i, sched_group_cpus(group)) {
03444 unsigned long wl;
03445
03446 if (!cpumask_test_cpu(i, cpus))
03447 continue;
03448
03449 rq = cpu_rq(i);
03450 wl = weighted_cpuload(i);
03451
03452 if (rq->nr_running == 1 && wl > imbalance)
03453 continue;
03454
03455 if (wl > max_load) {
03456 max_load = wl;
03457 busiest = rq;
03458 }
03459 }
03460
03461 return busiest;
03462 }
03463
03464
03465
03466
03467
03468 #define MAX_PINNED_INTERVAL 512
03469
03470
03471
03472
03473
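/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */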
03474 static int load_balance(int this_cpu, struct rq *this_rq,
03475 struct sched_domain *sd, enum cpu_idle_type idle,
03476 int *balance, struct cpumask *cpus)
03477 {
03478 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
03479 struct sched_group *group;
03480 unsigned long imbalance;
03481 struct rq *busiest;
03482 unsigned long flags;
03483
03484 cpumask_setall(cpus);
03485
03486
03487
03488
03489
03490
03491
03492 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
03493 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03494 sd_idle = 1;
03495
03496 schedstat_inc(sd, lb_count[idle]);
03497
03498 redo:
03499 update_shares(sd);
03500 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
03501 cpus, balance);
03502
03503 if (*balance == 0)
03504 goto out_balanced;
03505
03506 if (!group) {
03507 schedstat_inc(sd, lb_nobusyg[idle]);
03508 goto out_balanced;
03509 }
03510
03511 busiest = find_busiest_queue(group, idle, imbalance, cpus);
03512 if (!busiest) {
03513 schedstat_inc(sd, lb_nobusyq[idle]);
03514 goto out_balanced;
03515 }
03516
03517 BUG_ON(busiest == this_rq);
03518
03519 schedstat_add(sd, lb_imbalance[idle], imbalance);
03520
03521 ld_moved = 0;
03522 if (busiest->nr_running > 1) {
03523
03524
03525
03526
03527
03528
03529 local_irq_save(flags);
03530 double_rq_lock(this_rq, busiest);
03531 ld_moved = move_tasks(this_rq, this_cpu, busiest,
03532 imbalance, sd, idle, &all_pinned);
03533 double_rq_unlock(this_rq, busiest);
03534 local_irq_restore(flags);
03535
03536
03537
03538
03539 if (ld_moved && this_cpu != smp_processor_id())
03540 resched_cpu(this_cpu);
03541
03542
03543 if (unlikely(all_pinned)) {
03544 cpumask_clear_cpu(cpu_of(busiest), cpus);
03545 if (!cpumask_empty(cpus))
03546 goto redo;
03547 goto out_balanced;
03548 }
03549 }
03550
03551 if (!ld_moved) {
03552 schedstat_inc(sd, lb_failed[idle]);
03553 sd->nr_balance_failed++;
03554
03555 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
03556
03557 spin_lock_irqsave(&busiest->lock, flags);
03558
03559
03560
03561
03562 if (!cpumask_test_cpu(this_cpu,
03563 &busiest->curr->cpus_allowed)) {
03564 spin_unlock_irqrestore(&busiest->lock, flags);
03565 all_pinned = 1;
03566 goto out_one_pinned;
03567 }
03568
03569 if (!busiest->active_balance) {
03570 busiest->active_balance = 1;
03571 busiest->push_cpu = this_cpu;
03572 active_balance = 1;
03573 }
03574 spin_unlock_irqrestore(&busiest->lock, flags);
03575 if (active_balance)
03576 wake_up_process(busiest->migration_thread);
03577
03578
03579
03580
03581
03582 sd->nr_balance_failed = sd->cache_nice_tries+1;
03583 }
03584 } else
03585 sd->nr_balance_failed = 0;
03586
03587 if (likely(!active_balance)) {
03588
03589 sd->balance_interval = sd->min_interval;
03590 } else {
03591
03592
03593
03594
03595
03596
03597 if (sd->balance_interval < sd->max_interval)
03598 sd->balance_interval *= 2;
03599 }
03600
03601 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
03602 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03603 ld_moved = -1;
03604
03605 goto out;
03606
03607 out_balanced:
03608 schedstat_inc(sd, lb_balanced[idle]);
03609
03610 sd->nr_balance_failed = 0;
03611
03612 out_one_pinned:
03613
03614 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
03615 (sd->balance_interval < sd->max_interval))
03616 sd->balance_interval *= 2;
03617
03618 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
03619 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03620 ld_moved = -1;
03621 else
03622 ld_moved = 0;
03623 out:
03624 if (ld_moved)
03625 update_shares(sd);
03626 return ld_moved;
03627 }
03628
03629
03630
03631
03632
03633
03634
03635
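/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from schedule() when this_rq is about to become idle
 * (CPU_NEWLY_IDLE). this_rq is locked.
 */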
03636 static int
03637 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
03638 struct cpumask *cpus)
03639 {
03640 struct sched_group *group;
03641 struct rq *busiest = NULL;
03642 unsigned long imbalance;
03643 int ld_moved = 0;
03644 int sd_idle = 0;
03645 int all_pinned = 0;
03646
03647 cpumask_setall(cpus);
03648
03649
03650
03651
03652
03653
03654
03655 if (sd->flags & SD_SHARE_CPUPOWER &&
03656 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03657 sd_idle = 1;
03658
03659 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
03660 redo:
03661 update_shares_locked(this_rq, sd);
03662 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
03663 &sd_idle, cpus, NULL);
03664 if (!group) {
03665 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
03666 goto out_balanced;
03667 }
03668
03669 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
03670 if (!busiest) {
03671 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
03672 goto out_balanced;
03673 }
03674
03675 BUG_ON(busiest == this_rq);
03676
03677 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
03678
03679 ld_moved = 0;
03680 if (busiest->nr_running > 1) {
03681
03682 double_lock_balance(this_rq, busiest);
03683
03684 update_rq_clock(busiest);
03685 ld_moved = move_tasks(this_rq, this_cpu, busiest,
03686 imbalance, sd, CPU_NEWLY_IDLE,
03687 &all_pinned);
03688 double_unlock_balance(this_rq, busiest);
03689
03690 if (unlikely(all_pinned)) {
03691 cpumask_clear_cpu(cpu_of(busiest), cpus);
03692 if (!cpumask_empty(cpus))
03693 goto redo;
03694 }
03695 }
03696
03697 if (!ld_moved) {
03698 int active_balance = 0;
03699
03700 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
03701 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
03702 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03703 return -1;
03704
03705 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
03706 return -1;
03707
03708 if (sd->nr_balance_failed++ < 2)
03709 return -1;
03710
03711
03712
03713
03714
03715
03716
03717
03718
03719
03720
03721
03722
03723
03724
03725
03726
03727
03728
03729
03730
03731
03732
03733
03734 double_lock_balance(this_rq, busiest);
03735
03736
03737
03738
03739
03740 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
03741 double_unlock_balance(this_rq, busiest);
03742 all_pinned = 1;
03743 return ld_moved;
03744 }
03745
03746 if (!busiest->active_balance) {
03747 busiest->active_balance = 1;
03748 busiest->push_cpu = this_cpu;
03749 active_balance = 1;
03750 }
03751
03752 double_unlock_balance(this_rq, busiest);
03753
03754
03755
03756 spin_unlock(&this_rq->lock);
03757 if (active_balance)
03758 wake_up_process(busiest->migration_thread);
03759 spin_lock(&this_rq->lock);
03760
03761 } else
03762 sd->nr_balance_failed = 0;
03763
03764 update_shares_locked(this_rq, sd);
03765 return ld_moved;
03766
03767 out_balanced:
03768 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
03769 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
03770 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
03771 return -1;
03772 sd->nr_balance_failed = 0;
03773
03774 return 0;
03775 }
03776
03777
03778
03779
03780
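/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */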
03781 static void idle_balance(int this_cpu, struct rq *this_rq)
03782 {
03783 struct sched_domain *sd;
03784 int pulled_task = 0;
03785 unsigned long next_balance = jiffies + HZ;
03786 cpumask_var_t tmpmask;
03787
03788 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
03789 return;
03790
03791 for_each_domain(this_cpu, sd) {
03792 unsigned long interval;
03793
03794 if (!(sd->flags & SD_LOAD_BALANCE))
03795 continue;
03796
03797 if (sd->flags & SD_BALANCE_NEWIDLE)
03798
03799 pulled_task = load_balance_newidle(this_cpu, this_rq,
03800 sd, tmpmask);
03801
03802 interval = msecs_to_jiffies(sd->balance_interval);
03803 if (time_after(next_balance, sd->last_balance + interval))
03804 next_balance = sd->last_balance + interval;
03805 if (pulled_task)
03806 break;
03807 }
03808 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
03809
03810
03811
03812
03813 this_rq->next_balance = next_balance;
03814 }
03815 free_cpumask_var(tmpmask);
03816 }
03817
03818
03819
03820
03821
03822
03823
03824
03825
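/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the busiest CPU onto the CPU recorded in push_cpu. It can be
 * required to correctly have at least 1 task running on each physical CPU
 * where possible, and to avoid physical/logical imbalances.
 *
 * Called with busiest_rq locked.
 */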
03826 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
03827 {
03828 int target_cpu = busiest_rq->push_cpu;
03829 struct sched_domain *sd;
03830 struct rq *target_rq;
03831
03832
03833 if (busiest_rq->nr_running <= 1)
03834 return;
03835
03836 target_rq = cpu_rq(target_cpu);
03837
03838
03839
03840
03841
03842
03843 BUG_ON(busiest_rq == target_rq);
03844
03845
03846 double_lock_balance(busiest_rq, target_rq);
03847 update_rq_clock(busiest_rq);
03848 update_rq_clock(target_rq);
03849
03850
03851 for_each_domain(target_cpu, sd) {
03852 if ((sd->flags & SD_LOAD_BALANCE) &&
03853 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
03854 break;
03855 }
03856
03857 if (likely(sd)) {
03858 schedstat_inc(sd, alb_count);
03859
03860 if (move_one_task(target_rq, target_cpu, busiest_rq,
03861 sd, CPU_IDLE))
03862 schedstat_inc(sd, alb_pushed);
03863 else
03864 schedstat_inc(sd, alb_failed);
03865 }
03866 double_unlock_balance(busiest_rq, target_rq);
03867 }
03868
03869 #ifdef CONFIG_NO_HZ
03870 static struct {
03871 atomic_t load_balancer;
03872 cpumask_var_t cpu_mask;
03873 } nohz ____cacheline_aligned = {
03874 .load_balancer = ATOMIC_INIT(-1),
03875 };
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
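/*
 * This routine tries to nominate the idle load balancing (ilb) owner among
 * the CPUs that are stopping their ticks. If every online CPU has stopped
 * its tick, the current owner resigns and each CPU does its own idle load
 * balancing when it next wakes up.
 *
 * Returns 1 if the calling CPU is (or becomes) the ilb owner, 0 otherwise.
 */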
03897 int select_nohz_load_balancer(int stop_tick)
03898 {
03899 int cpu = smp_processor_id();
03900
03901 if (stop_tick) {
03902 cpu_rq(cpu)->in_nohz_recently = 1;
03903
03904 if (!cpu_active(cpu)) {
03905 if (atomic_read(&nohz.load_balancer) != cpu)
03906 return 0;
03907
03908
03909
03910
03911
03912 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
03913 BUG();
03914
03915 return 0;
03916 }
03917
03918 cpumask_set_cpu(cpu, nohz.cpu_mask);
03919
03920
03921 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
03922 if (atomic_read(&nohz.load_balancer) == cpu)
03923 atomic_set(&nohz.load_balancer, -1);
03924 return 0;
03925 }
03926
03927 if (atomic_read(&nohz.load_balancer) == -1) {
03928
03929 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
03930 return 1;
03931 } else if (atomic_read(&nohz.load_balancer) == cpu)
03932 return 1;
03933 } else {
03934 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
03935 return 0;
03936
03937 cpumask_clear_cpu(cpu, nohz.cpu_mask);
03938
03939 if (atomic_read(&nohz.load_balancer) == cpu)
03940 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
03941 BUG();
03942 }
03943 return 0;
03944 }
03945 #endif
03946
03947 static DEFINE_SPINLOCK(balancing);
03948
03949
03950
03951
03952
03953
03954
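/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */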
03955 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
03956 {
03957 int balance = 1;
03958 struct rq *rq = cpu_rq(cpu);
03959 unsigned long interval;
03960 struct sched_domain *sd;
03961
03962 unsigned long next_balance = jiffies + 60*HZ;
03963 int update_next_balance = 0;
03964 int need_serialize;
03965 cpumask_var_t tmp;
03966
03967
03968 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
03969 return;
03970
03971 for_each_domain(cpu, sd) {
03972 if (!(sd->flags & SD_LOAD_BALANCE))
03973 continue;
03974
03975 interval = sd->balance_interval;
03976 if (idle != CPU_IDLE)
03977 interval *= sd->busy_factor;
03978
03979
03980 interval = msecs_to_jiffies(interval);
03981 if (unlikely(!interval))
03982 interval = 1;
03983 if (interval > HZ*NR_CPUS/10)
03984 interval = HZ*NR_CPUS/10;
03985
03986 need_serialize = sd->flags & SD_SERIALIZE;
03987
03988 if (need_serialize) {
03989 if (!spin_trylock(&balancing))
03990 goto out;
03991 }
03992
03993 if (time_after_eq(jiffies, sd->last_balance + interval)) {
03994 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
03995
03996
03997
03998
03999
04000 idle = CPU_NOT_IDLE;
04001 }
04002 sd->last_balance = jiffies;
04003 }
04004 if (need_serialize)
04005 spin_unlock(&balancing);
04006 out:
04007 if (time_after(next_balance, sd->last_balance + interval)) {
04008 next_balance = sd->last_balance + interval;
04009 update_next_balance = 1;
04010 }
04011
04012
04013
04014
04015
04016
04017 if (!balance)
04018 break;
04019 }
04020
04021
04022
04023
04024
04025
04026 if (likely(update_next_balance))
04027 rq->next_balance = next_balance;
04028
04029 free_cpumask_var(tmp);
04030 }
04031
04032
04033
04034
04035
04036
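/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In the CONFIG_NO_HZ case, the idle load balance owner also balances on
 * behalf of all the idle CPUs whose ticks are stopped.
 */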
04037 static void run_rebalance_domains(struct softirq_action *h)
04038 {
04039 int this_cpu = smp_processor_id();
04040 struct rq *this_rq = cpu_rq(this_cpu);
04041 enum cpu_idle_type idle = this_rq->idle_at_tick ?
04042 CPU_IDLE : CPU_NOT_IDLE;
04043
04044 rebalance_domains(this_cpu, idle);
04045
04046 #ifdef CONFIG_NO_HZ
04047
04048
04049
04050
04051
04052 if (this_rq->idle_at_tick &&
04053 atomic_read(&nohz.load_balancer) == this_cpu) {
04054 struct rq *rq;
04055 int balance_cpu;
04056
04057 for_each_cpu(balance_cpu, nohz.cpu_mask) {
04058 if (balance_cpu == this_cpu)
04059 continue;
04060
04061
04062
04063
04064
04065
04066 if (need_resched())
04067 break;
04068
04069 rebalance_domains(balance_cpu, CPU_IDLE);
04070
04071 rq = cpu_rq(balance_cpu);
04072 if (time_after(this_rq->next_balance, rq->next_balance))
04073 this_rq->next_balance = rq->next_balance;
04074 }
04075 }
04076 #endif
04077 }
04078
04079
04080
04081
04082
04083
04084
04085
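/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 * In the CONFIG_NO_HZ case this also hands off or kicks the idle load
 * balance owner when needed.
 */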
04086 static inline void trigger_load_balance(struct rq *rq, int cpu)
04087 {
04088 #ifdef CONFIG_NO_HZ
04089
04090
04091
04092
04093
04094 if (rq->in_nohz_recently && !rq->idle_at_tick) {
04095 rq->in_nohz_recently = 0;
04096
04097 if (atomic_read(&nohz.load_balancer) == cpu) {
04098 cpumask_clear_cpu(cpu, nohz.cpu_mask);
04099 atomic_set(&nohz.load_balancer, -1);
04100 }
04101
04102 if (atomic_read(&nohz.load_balancer) == -1) {
04103
04104
04105
04106
04107
04108
04109
04110
04111 int ilb = cpumask_first(nohz.cpu_mask);
04112
04113 if (ilb < nr_cpu_ids)
04114 resched_cpu(ilb);
04115 }
04116 }
04117
04118
04119
04120
04121
04122 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
04123 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
04124 resched_cpu(cpu);
04125 return;
04126 }
04127
04128
04129
04130
04131
04132 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
04133 cpumask_test_cpu(cpu, nohz.cpu_mask))
04134 return;
04135 #endif
04136 if (time_after_eq(jiffies, rq->next_balance))
04137 raise_softirq(SCHED_SOFTIRQ);
04138 }
04139
04140 #else
04141
04142
04143
04144
04145 static inline void idle_balance(int cpu, struct rq *rq)
04146 {
04147 }
04148
04149 #endif
04150
04151 DEFINE_PER_CPU(struct kernel_stat, kstat);
04152
04153 EXPORT_PER_CPU_SYMBOL(kstat);
04154
04155
04156
04157
04158
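/*
 * Return the number of nanoseconds of CPU time that @p has accrued on its
 * current slice but not yet accounted (rq->clock - se.exec_start) if it is
 * currently running, 0 otherwise.
 */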
04159 unsigned long long task_delta_exec(struct task_struct *p)
04160 {
04161 unsigned long flags;
04162 struct rq *rq;
04163 u64 ns = 0;
04164
04165 rq = task_rq_lock(p, &flags);
04166
04167 if (task_current(rq, p)) {
04168 u64 delta_exec;
04169
04170 update_rq_clock(rq);
04171 delta_exec = rq->clock - p->se.exec_start;
04172 if ((s64)delta_exec > 0)
04173 ns = delta_exec;
04174 }
04175
04176 task_rq_unlock(rq, &flags);
04177
04178 return ns;
04179 }
04180
04181
04182
04183
04184
04185
04186
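/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */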
04187 void account_user_time(struct task_struct *p, cputime_t cputime,
04188 cputime_t cputime_scaled)
04189 {
04190 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
04191 cputime64_t tmp;
04192
04193
04194 p->utime = cputime_add(p->utime, cputime);
04195 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
04196 account_group_user_time(p, cputime);
04197
04198
04199 tmp = cputime_to_cputime64(cputime);
04200 if (TASK_NICE(p) > 0)
04201 cpustat->nice = cputime64_add(cpustat->nice, tmp);
04202 else
04203 cpustat->user = cputime64_add(cpustat->user, tmp);
04204
04205 acct_update_integrals(p);
04206 }
04207
04208
04209
04210
04211
04212
04213
04214 static void account_guest_time(struct task_struct *p, cputime_t cputime,
04215 cputime_t cputime_scaled)
04216 {
04217 cputime64_t tmp;
04218 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
04219
04220 tmp = cputime_to_cputime64(cputime);
04221
04222
04223 p->utime = cputime_add(p->utime, cputime);
04224 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
04225 account_group_user_time(p, cputime);
04226 p->gtime = cputime_add(p->gtime, cputime);
04227
04228
04229 cpustat->user = cputime64_add(cpustat->user, tmp);
04230 cpustat->guest = cputime64_add(cpustat->guest, tmp);
04231 }
04232
04233
04234
04235
04236
04237
04238
04239
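/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */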
04240 void account_system_time(struct task_struct *p, int hardirq_offset,
04241 cputime_t cputime, cputime_t cputime_scaled)
04242 {
04243 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
04244 cputime64_t tmp;
04245
04246 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
04247 account_guest_time(p, cputime, cputime_scaled);
04248 return;
04249 }
04250
04251
04252 p->stime = cputime_add(p->stime, cputime);
04253 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
04254 account_group_system_time(p, cputime);
04255
04256
04257 tmp = cputime_to_cputime64(cputime);
04258 if (hardirq_count() - hardirq_offset)
04259 cpustat->irq = cputime64_add(cpustat->irq, tmp);
04260 else if (softirq_count())
04261 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
04262 else
04263 cpustat->system = cputime64_add(cpustat->system, tmp);
04264
04265
04266 acct_update_integrals(p);
04267 }
04268
04269
04270
04271
04272
04273 void account_steal_time(cputime_t cputime)
04274 {
04275 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
04276 cputime64_t cputime64 = cputime_to_cputime64(cputime);
04277
04278 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
04279 }
04280
04281
04282
04283
04284
04285 void account_idle_time(cputime_t cputime)
04286 {
04287 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
04288 cputime64_t cputime64 = cputime_to_cputime64(cputime);
04289 struct rq *rq = this_rq();
04290
04291 if (atomic_read(&rq->nr_iowait) > 0)
04292 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
04293 else
04294 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
04295 }
04296
04297 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
04298
04299
04300
04301
04302
04303
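/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */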
04304 void account_process_tick(struct task_struct *p, int user_tick)
04305 {
04306 cputime_t one_jiffy = jiffies_to_cputime(1);
04307 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
04308 struct rq *rq = this_rq();
04309
04310 if (user_tick)
04311 account_user_time(p, one_jiffy, one_jiffy_scaled);
04312 else if (p != rq->idle)
04313 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
04314 one_jiffy_scaled);
04315 else
04316 account_idle_time(one_jiffy);
04317 }
04318
04319
04320
04321
04322
04323
04324 void account_steal_ticks(unsigned long ticks)
04325 {
04326 account_steal_time(jiffies_to_cputime(ticks));
04327 }
04328
04329
04330
04331
04332
04333 void account_idle_ticks(unsigned long ticks)
04334 {
04335 account_idle_time(jiffies_to_cputime(ticks));
04336 }
04337
04338 #endif
04339
04340
04341
04342
04343 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
04344 cputime_t task_utime(struct task_struct *p)
04345 {
04346 return p->utime;
04347 }
04348
04349 cputime_t task_stime(struct task_struct *p)
04350 {
04351 return p->stime;
04352 }
04353 #else
04354 cputime_t task_utime(struct task_struct *p)
04355 {
04356 clock_t utime = cputime_to_clock_t(p->utime),
04357 total = utime + cputime_to_clock_t(p->stime);
04358 u64 temp;
04359
04360
04361
04362
04363 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
04364
04365 if (total) {
04366 temp *= utime;
04367 do_div(temp, total);
04368 }
04369 utime = (clock_t)temp;
04370
04371 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
04372 return p->prev_utime;
04373 }
04374
04375 cputime_t task_stime(struct task_struct *p)
04376 {
04377 clock_t stime;
04378
04379
04380
04381
04382
04383
04384 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
04385 cputime_to_clock_t(task_utime(p));
04386
04387 if (stime >= 0)
04388 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
04389
04390 return p->prev_stime;
04391 }
04392 #endif
04393
04394 inline cputime_t task_gtime(struct task_struct *p)
04395 {
04396 return p->gtime;
04397 }
04398
04399
04400
04401
04402
04403
04404
04405
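/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */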
04406 void scheduler_tick(void)
04407 {
04408 int cpu = smp_processor_id();
04409 struct rq *rq = cpu_rq(cpu);
04410 struct task_struct *curr = rq->curr;
04411
04412 sched_clock_tick();
04413
04414 spin_lock(&rq->lock);
04415 update_rq_clock(rq);
04416 update_cpu_load(rq);
04417 curr->sched_class->task_tick(rq, curr, 0);
04418 spin_unlock(&rq->lock);
04419
04420 #ifdef CONFIG_SMP
04421 rq->idle_at_tick = idle_cpu(cpu);
04422 trigger_load_balance(rq, cpu);
04423 #endif
04424 }
04425
04426 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
04427 defined(CONFIG_PREEMPT_TRACER))
04428
04429 static inline unsigned long get_parent_ip(unsigned long addr)
04430 {
04431 if (in_lock_functions(addr)) {
04432 addr = CALLER_ADDR2;
04433 if (in_lock_functions(addr))
04434 addr = CALLER_ADDR3;
04435 }
04436 return addr;
04437 }
04438
04439 void __kprobes add_preempt_count(int val)
04440 {
04441 #ifdef CONFIG_DEBUG_PREEMPT
04442
04443
04444
04445 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
04446 return;
04447 #endif
04448 preempt_count() += val;
04449 #ifdef CONFIG_DEBUG_PREEMPT
04450
04451
04452
04453 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
04454 PREEMPT_MASK - 10);
04455 #endif
04456 if (preempt_count() == val)
04457 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
04458 }
04459 EXPORT_SYMBOL(add_preempt_count);
04460
04461 void __kprobes sub_preempt_count(int val)
04462 {
04463 #ifdef CONFIG_DEBUG_PREEMPT
04464
04465
04466
04467 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
04468 return;
04469
04470
04471
04472 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
04473 !(preempt_count() & PREEMPT_MASK)))
04474 return;
04475 #endif
04476
04477 if (preempt_count() == val)
04478 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
04479 preempt_count() -= val;
04480 }
04481 EXPORT_SYMBOL(sub_preempt_count);
04482
04483 #endif
04484
04485
04486
04487
04488 static noinline void __schedule_bug(struct task_struct *prev)
04489 {
04490 struct pt_regs *regs = get_irq_regs();
04491
04492 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
04493 prev->comm, prev->pid, preempt_count());
04494
04495 debug_show_held_locks(prev);
04496 print_modules();
04497 if (irqs_disabled())
04498 print_irqtrace_events(prev);
04499
04500 if (regs)
04501 show_regs(regs);
04502 else
04503 dump_stack();
04504 }
04505
04506
04507
04508
04509 static inline void schedule_debug(struct task_struct *prev)
04510 {
04511
04512
04513
04514
04515
04516 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
04517 __schedule_bug(prev);
04518
04519 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
04520
04521 schedstat_inc(this_rq(), sched_count);
04522 #ifdef CONFIG_SCHEDSTATS
04523 if (unlikely(prev->lock_depth >= 0)) {
04524 schedstat_inc(this_rq(), bkl_count);
04525 schedstat_inc(prev, sched_info.bkl_count);
04526 }
04527 #endif
04528 }
04529
04530
04531
04532
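/*
 * Pick up the highest-prio task. As an optimization, if all runnable tasks
 * are in the fair class, call that class' pick_next_task() directly.
 */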
04533 static inline struct task_struct *
04534 pick_next_task(struct rq *rq, struct task_struct *prev)
04535 {
04536 const struct sched_class *class;
04537 struct task_struct *p;
04538
04539
04540
04541
04542
04543 if (likely(rq->nr_running == rq->cfs.nr_running)) {
04544 p = fair_sched_class.pick_next_task(rq);
04545 if (likely(p))
04546 return p;
04547 }
04548
04549 class = sched_class_highest;
04550 for ( ; ; ) {
04551 p = class->pick_next_task(rq);
04552 if (p)
04553 return p;
04554
04555
04556
04557
04558 class = class->next;
04559 }
04560 }
04561
04562
04563
04564
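/*
 * schedule() is the main scheduler function.
 */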
04565 asmlinkage void __sched schedule(void)
04566 {
04567 struct task_struct *prev, *next;
04568 unsigned long *switch_count;
04569 struct rq *rq;
04570 int cpu;
04571
04572 need_resched:
04573 preempt_disable();
04574 cpu = smp_processor_id();
04575 rq = cpu_rq(cpu);
04576 rcu_qsctr_inc(cpu);
04577 prev = rq->curr;
04578 switch_count = &prev->nivcsw;
04579
04580 release_kernel_lock(prev);
04581 need_resched_nonpreemptible:
04582
04583 schedule_debug(prev);
04584
04585 if (sched_feat(HRTICK))
04586 hrtick_clear(rq);
04587
04588 spin_lock_irq(&rq->lock);
04589 update_rq_clock(rq);
04590 clear_tsk_need_resched(prev);
04591
04592 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
04593 if (unlikely(signal_pending_state(prev->state, prev)))
04594 prev->state = TASK_RUNNING;
04595 else
04596 deactivate_task(rq, prev, 1);
04597 switch_count = &prev->nvcsw;
04598 }
04599
04600 #ifdef CONFIG_SMP
04601 if (prev->sched_class->pre_schedule)
04602 prev->sched_class->pre_schedule(rq, prev);
04603 #endif
04604
04605 if (unlikely(!rq->nr_running))
04606 idle_balance(cpu, rq);
04607
04608 prev->sched_class->put_prev_task(rq, prev);
04609 next = pick_next_task(rq, prev);
04610
04611 if (likely(prev != next)) {
04612 sched_info_switch(prev, next);
04613
04614 rq->nr_switches++;
04615 rq->curr = next;
04616 ++*switch_count;
04617
04618 context_switch(rq, prev, next);
04619
04620
04621
04622
04623 cpu = smp_processor_id();
04624 rq = cpu_rq(cpu);
04625 } else
04626 spin_unlock_irq(&rq->lock);
04627
04628 if (unlikely(reacquire_kernel_lock(current) < 0))
04629 goto need_resched_nonpreemptible;
04630
04631 preempt_enable_no_resched();
04632 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
04633 goto need_resched;
04634 }
04635 EXPORT_SYMBOL(schedule);
04636
04637 #ifdef CONFIG_PREEMPT
04638
04639
04640
04641
04642
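/*
 * This is the entry point to schedule() for in-kernel preemption, called
 * off of preempt_enable(). It returns immediately if preemption is
 * currently disabled or interrupts are off.
 */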
04643 asmlinkage void __sched preempt_schedule(void)
04644 {
04645 struct thread_info *ti = current_thread_info();
04646
04647
04648
04649
04650
04651 if (likely(ti->preempt_count || irqs_disabled()))
04652 return;
04653
04654 do {
04655 add_preempt_count(PREEMPT_ACTIVE);
04656 schedule();
04657 sub_preempt_count(PREEMPT_ACTIVE);
04658
04659
04660
04661
04662
04663 barrier();
04664 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
04665 }
04666 EXPORT_SYMBOL(preempt_schedule);
04667
04668
04669
04670
04671
04672
04673
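/*
 * This is the entry point to schedule() for kernel preemption off of irq
 * context. Note that it is called and returns with irqs disabled; this
 * protects us against recursive calling from irq.
 */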
04674 asmlinkage void __sched preempt_schedule_irq(void)
04675 {
04676 struct thread_info *ti = current_thread_info();
04677
04678
04679 BUG_ON(ti->preempt_count || !irqs_disabled());
04680
04681 do {
04682 add_preempt_count(PREEMPT_ACTIVE);
04683 local_irq_enable();
04684 schedule();
04685 local_irq_disable();
04686 sub_preempt_count(PREEMPT_ACTIVE);
04687
04688
04689
04690
04691
04692 barrier();
04693 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
04694 }
04695
04696 #endif
04697 #endif
04698
04699 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
04700 void *key)
04701 {
04702 return try_to_wake_up(curr->private, mode, sync);
04703 }
04704 EXPORT_SYMBOL(default_wake_function);
04705
04706
04707
04708
04709
04710
04711
04712
04713
04714
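/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small
 * positive number) then we wake all the non-exclusive tasks and at most
 * nr_exclusive exclusive tasks.
 */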
04715 void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
04716 int nr_exclusive, int sync, void *key)
04717 {
04718 wait_queue_t *curr, *next;
04719
04720 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
04721 unsigned flags = curr->flags;
04722
04723 if (curr->func(curr, mode, sync, key) &&
04724 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
04725 break;
04726 }
04727 }
04728
04729
04730
04731
04732
04733
04734
04735
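/*
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 */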
04736 void __wake_up(wait_queue_head_t *q, unsigned int mode,
04737 int nr_exclusive, void *key)
04738 {
04739 unsigned long flags;
04740
04741 spin_lock_irqsave(&q->lock, flags);
04742 __wake_up_common(q, mode, nr_exclusive, 0, key);
04743 spin_unlock_irqrestore(&q->lock, flags);
04744 }
04745 EXPORT_SYMBOL(__wake_up);
04746
04747
04748
04749
04750 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
04751 {
04752 __wake_up_common(q, mode, 1, 0, NULL);
04753 }
04754
04755
04756
04757
04758
04759
04760
04761
04762
04763
04764
04765
04766
04767
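/*
 * __wake_up_sync - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 *
 * The sync wakeup differs in that the waker knows it will schedule away
 * soon, so the woken thread will not be migrated to another CPU - i.e.
 * the two threads are "synchronized" with each other. This can prevent
 * needless bouncing between CPUs.
 */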
04768 void
04769 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
04770 {
04771 unsigned long flags;
04772 int sync = 1;
04773
04774 if (unlikely(!q))
04775 return;
04776
04777 if (unlikely(!nr_exclusive))
04778 sync = 0;
04779
04780 spin_lock_irqsave(&q->lock, flags);
04781 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
04782 spin_unlock_irqrestore(&q->lock, flags);
04783 }
04784 EXPORT_SYMBOL_GPL(__wake_up_sync);
04785
04786
04787
04788
04789
04790
04791
04792
04793
04794
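/*
 * complete: - signals a single thread waiting on this completion
 * @x: holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion.
 */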
04795 void complete(struct completion *x)
04796 {
04797 unsigned long flags;
04798
04799 spin_lock_irqsave(&x->wait.lock, flags);
04800 x->done++;
04801 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
04802 spin_unlock_irqrestore(&x->wait.lock, flags);
04803 }
04804 EXPORT_SYMBOL(complete);
04805
04806
04807
04808
04809
04810
04811
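/*
 * complete_all: - signals all threads waiting on this completion
 * @x: holds the state of this particular completion
 */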
04812 void complete_all(struct completion *x)
04813 {
04814 unsigned long flags;
04815
04816 spin_lock_irqsave(&x->wait.lock, flags);
04817 x->done += UINT_MAX/2;
04818 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
04819 spin_unlock_irqrestore(&x->wait.lock, flags);
04820 }
04821 EXPORT_SYMBOL(complete_all);
04822
04823 static inline long __sched
04824 do_wait_for_common(struct completion *x, long timeout, int state)
04825 {
04826 if (!x->done) {
04827 DECLARE_WAITQUEUE(wait, current);
04828
04829 wait.flags |= WQ_FLAG_EXCLUSIVE;
04830 __add_wait_queue_tail(&x->wait, &wait);
04831 do {
04832 if (signal_pending_state(state, current)) {
04833 timeout = -ERESTARTSYS;
04834 break;
04835 }
04836 __set_current_state(state);
04837 spin_unlock_irq(&x->wait.lock);
04838 timeout = schedule_timeout(timeout);
04839 spin_lock_irq(&x->wait.lock);
04840 } while (!x->done && timeout);
04841 __remove_wait_queue(&x->wait, &wait);
04842 if (!x->done)
04843 return timeout;
04844 }
04845 x->done--;
04846 return timeout ?: 1;
04847 }
04848
04849 static long __sched
04850 wait_for_common(struct completion *x, long timeout, int state)
04851 {
04852 might_sleep();
04853
04854 spin_lock_irq(&x->wait.lock);
04855 timeout = do_wait_for_common(x, timeout, state);
04856 spin_unlock_irq(&x->wait.lock);
04857 return timeout;
04858 }
04859
04860
04861
04862
04863
04864
04865
04866
04867
04868
04869
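/*
 * wait_for_completion: - waits for completion of a task
 * @x: holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 */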
04870 void __sched wait_for_completion(struct completion *x)
04871 {
04872 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
04873 }
04874 EXPORT_SYMBOL(wait_for_completion);
04875
04876
04877
04878
04879
04880
04881
04882
04883
04884
04885 unsigned long __sched
04886 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
04887 {
04888 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
04889 }
04890 EXPORT_SYMBOL(wait_for_completion_timeout);
04891
04892
04893
04894
04895
04896
04897
04898
04899 int __sched wait_for_completion_interruptible(struct completion *x)
04900 {
04901 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
04902 if (t == -ERESTARTSYS)
04903 return t;
04904 return 0;
04905 }
04906 EXPORT_SYMBOL(wait_for_completion_interruptible);
04907
04908
04909
04910
04911
04912
04913
04914
04915
04916 unsigned long __sched
04917 wait_for_completion_interruptible_timeout(struct completion *x,
04918 unsigned long timeout)
04919 {
04920 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
04921 }
04922 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
04923
04924
04925
04926
04927
04928
04929
04930
04931 int __sched wait_for_completion_killable(struct completion *x)
04932 {
04933 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
04934 if (t == -ERESTARTSYS)
04935 return t;
04936 return 0;
04937 }
04938 EXPORT_SYMBOL(wait_for_completion_killable);
04939
04940
04941
04942
04943
04944
04945
04946
04947
04948
04949
04950
04951
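/*
 * try_wait_for_completion - try to decrement a completion without blocking
 * @x: completion structure
 *
 * Returns 0 if a decrement cannot be done without blocking, 1 otherwise.
 */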
04952 bool try_wait_for_completion(struct completion *x)
04953 {
04954 int ret = 1;
04955
04956 spin_lock_irq(&x->wait.lock);
04957 if (!x->done)
04958 ret = 0;
04959 else
04960 x->done--;
04961 spin_unlock_irq(&x->wait.lock);
04962 return ret;
04963 }
04964 EXPORT_SYMBOL(try_wait_for_completion);
04965
04966
04967
04968
04969
04970
04971
04972
04973
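/*
 * completion_done - test to see if a completion has been signalled
 * @x: completion structure
 *
 * Returns 0 if x->done is zero, 1 otherwise. Unlike
 * try_wait_for_completion() it does not consume a done count.
 */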
04974 bool completion_done(struct completion *x)
04975 {
04976 int ret = 1;
04977
04978 spin_lock_irq(&x->wait.lock);
04979 if (!x->done)
04980 ret = 0;
04981 spin_unlock_irq(&x->wait.lock);
04982 return ret;
04983 }
04984 EXPORT_SYMBOL(completion_done);
04985
04986 static long __sched
04987 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
04988 {
04989 unsigned long flags;
04990 wait_queue_t wait;
04991
04992 init_waitqueue_entry(&wait, current);
04993
04994 __set_current_state(state);
04995
04996 spin_lock_irqsave(&q->lock, flags);
04997 __add_wait_queue(q, &wait);
04998 spin_unlock(&q->lock);
04999 timeout = schedule_timeout(timeout);
05000 spin_lock_irq(&q->lock);
05001 __remove_wait_queue(q, &wait);
05002 spin_unlock_irqrestore(&q->lock, flags);
05003
05004 return timeout;
05005 }
05006
05007 #ifndef DDE_LINUX
05008 void __sched interruptible_sleep_on(wait_queue_head_t *q)
05009 {
05010 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
05011 }
05012 EXPORT_SYMBOL(interruptible_sleep_on);
05013
05014 long __sched
05015 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
05016 {
05017 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
05018 }
05019 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
05020
05021 void __sched sleep_on(wait_queue_head_t *q)
05022 {
05023 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
05024 }
05025 EXPORT_SYMBOL(sleep_on);
05026
05027 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
05028 {
05029 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
05030 }
05031 EXPORT_SYMBOL(sleep_on_timeout);
05032
05033 #ifdef CONFIG_RT_MUTEXES
05034
05035
05036
05037
05038
05039
05040
05041
05042
05043
05044
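/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */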
05045 void rt_mutex_setprio(struct task_struct *p, int prio)
05046 {
05047 unsigned long flags;
05048 int oldprio, on_rq, running;
05049 struct rq *rq;
05050 const struct sched_class *prev_class = p->sched_class;
05051
05052 BUG_ON(prio < 0 || prio > MAX_PRIO);
05053
05054 rq = task_rq_lock(p, &flags);
05055 update_rq_clock(rq);
05056
05057 oldprio = p->prio;
05058 on_rq = p->se.on_rq;
05059 running = task_current(rq, p);
05060 if (on_rq)
05061 dequeue_task(rq, p, 0);
05062 if (running)
05063 p->sched_class->put_prev_task(rq, p);
05064
05065 if (rt_prio(prio))
05066 p->sched_class = &rt_sched_class;
05067 else
05068 p->sched_class = &fair_sched_class;
05069
05070 p->prio = prio;
05071
05072 if (running)
05073 p->sched_class->set_curr_task(rq);
05074 if (on_rq) {
05075 enqueue_task(rq, p, 0);
05076
05077 check_class_changed(rq, p, prev_class, oldprio, running);
05078 }
05079 task_rq_unlock(rq, &flags);
05080 }
05081
05082 #endif
05083
05084 void set_user_nice(struct task_struct *p, long nice)
05085 {
05086 int old_prio, delta, on_rq;
05087 unsigned long flags;
05088 struct rq *rq;
05089
05090 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
05091 return;
05092
05093
05094
05095
05096 rq = task_rq_lock(p, &flags);
05097 update_rq_clock(rq);
05098
05099
05100
05101
05102
05103
05104 if (task_has_rt_policy(p)) {
05105 p->static_prio = NICE_TO_PRIO(nice);
05106 goto out_unlock;
05107 }
05108 on_rq = p->se.on_rq;
05109 if (on_rq)
05110 dequeue_task(rq, p, 0);
05111
05112 p->static_prio = NICE_TO_PRIO(nice);
05113 set_load_weight(p);
05114 old_prio = p->prio;
05115 p->prio = effective_prio(p);
05116 delta = p->prio - old_prio;
05117
05118 if (on_rq) {
05119 enqueue_task(rq, p, 0);
05120
05121
05122
05123
05124 if (delta < 0 || (delta > 0 && task_running(rq, p)))
05125 resched_task(rq->curr);
05126 }
05127 out_unlock:
05128 task_rq_unlock(rq, &flags);
05129 }
05130 EXPORT_SYMBOL(set_user_nice);
05131
05132
05133
05134
05135
05136
05137 int can_nice(const struct task_struct *p, const int nice)
05138 {
05139
05140 int nice_rlim = 20 - nice;
05141
05142 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
05143 capable(CAP_SYS_NICE));
05144 }
05145
05146 #ifdef __ARCH_WANT_SYS_NICE
05147
05148
05149
05150
05151
05152
05153
05154
05155 SYSCALL_DEFINE1(nice, int, increment)
05156 {
05157 long nice, retval;
05158
05159
05160
05161
05162
05163
05164 if (increment < -40)
05165 increment = -40;
05166 if (increment > 40)
05167 increment = 40;
05168
05169 nice = PRIO_TO_NICE(current->static_prio) + increment;
05170 if (nice < -20)
05171 nice = -20;
05172 if (nice > 19)
05173 nice = 19;
05174
05175 if (increment < 0 && !can_nice(current, nice))
05176 return -EPERM;
05177
05178 retval = security_task_setnice(current, nice);
05179 if (retval)
05180 return retval;
05181
05182 set_user_nice(current, nice);
05183 return 0;
05184 }
05185
05186 #endif
05187
05188
05189
05190
05191
05192
05193
05194
05195
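/*
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */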
05196 int task_prio(const struct task_struct *p)
05197 {
05198 return p->prio - MAX_RT_PRIO;
05199 }
05200
05201
05202
05203
05204
05205 int task_nice(const struct task_struct *p)
05206 {
05207 return TASK_NICE(p);
05208 }
05209 EXPORT_SYMBOL(task_nice);
05210
05211
05212
05213
05214
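/*
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 */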
05215 int idle_cpu(int cpu)
05216 {
05217 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
05218 }
05219
05220
05221
05222
05223
05224 struct task_struct *idle_task(int cpu)
05225 {
05226 return cpu_rq(cpu)->idle;
05227 }
05228
05229
05230
05231
05232
05233 static struct task_struct *find_process_by_pid(pid_t pid)
05234 {
05235 return pid ? find_task_by_vpid(pid) : current;
05236 }
05237
05238
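/* Actually do priority change: must hold rq lock. */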
05239 static void
05240 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
05241 {
05242 BUG_ON(p->se.on_rq);
05243
05244 p->policy = policy;
05245 switch (p->policy) {
05246 case SCHED_NORMAL:
05247 case SCHED_BATCH:
05248 case SCHED_IDLE:
05249 p->sched_class = &fair_sched_class;
05250 break;
05251 case SCHED_FIFO:
05252 case SCHED_RR:
05253 p->sched_class = &rt_sched_class;
05254 break;
05255 }
05256
05257 p->rt_priority = prio;
05258 p->normal_prio = normal_prio(p);
05259
05260 p->prio = rt_mutex_getprio(p);
05261 set_load_weight(p);
05262 }
05263 #endif
05264
05265
05266
05267
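/*
 * Check whether the caller owns the target task: the caller's euid must
 * match @p's uid or euid.
 */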
05268 static bool check_same_owner(struct task_struct *p)
05269 {
05270 const struct cred *cred = current_cred(), *pcred;
05271 bool match;
05272
05273 rcu_read_lock();
05274 pcred = __task_cred(p);
05275 match = (cred->euid == pcred->euid ||
05276 cred->euid == pcred->uid);
05277 rcu_read_unlock();
05278 return match;
05279 }
05280
05281 static int __sched_setscheduler(struct task_struct *p, int policy,
05282 struct sched_param *param, bool user)
05283 {
05284 #ifndef DDE_LINUX
05285 int retval, oldprio, oldpolicy = -1, on_rq, running;
05286 unsigned long flags;
05287 const struct sched_class *prev_class = p->sched_class;
05288 struct rq *rq;
05289
05290
05291 BUG_ON(in_interrupt());
05292 recheck:
05293
05294 if (policy < 0)
05295 policy = oldpolicy = p->policy;
05296 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
05297 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
05298 policy != SCHED_IDLE)
05299 return -EINVAL;
05300
05301
05302
05303
05304
05305 if (param->sched_priority < 0 ||
05306 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
05307 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
05308 return -EINVAL;
05309 if (rt_policy(policy) != (param->sched_priority != 0))
05310 return -EINVAL;
05311
05312
05313
05314
05315 if (user && !capable(CAP_SYS_NICE)) {
05316 if (rt_policy(policy)) {
05317 unsigned long rlim_rtprio;
05318
05319 if (!lock_task_sighand(p, &flags))
05320 return -ESRCH;
05321 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
05322 unlock_task_sighand(p, &flags);
05323
05324
05325 if (policy != p->policy && !rlim_rtprio)
05326 return -EPERM;
05327
05328
05329 if (param->sched_priority > p->rt_priority &&
05330 param->sched_priority > rlim_rtprio)
05331 return -EPERM;
05332 }
05333
05334
05335
05336
05337 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
05338 return -EPERM;
05339
05340
05341 if (!check_same_owner(p))
05342 return -EPERM;
05343 }
05344
05345 if (user) {
05346 #ifdef CONFIG_RT_GROUP_SCHED
05347
05348
05349
05350
05351 if (rt_bandwidth_enabled() && rt_policy(policy) &&
05352 task_group(p)->rt_bandwidth.rt_runtime == 0)
05353 return -EPERM;
05354 #endif
05355
05356 retval = security_task_setscheduler(p, policy, param);
05357 if (retval)
05358 return retval;
05359 }
05360
05361
05362
05363
05364
05365 spin_lock_irqsave(&p->pi_lock, flags);
05366
05367
05368
05369
05370 rq = __task_rq_lock(p);
05371
05372 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
05373 policy = oldpolicy = -1;
05374 __task_rq_unlock(rq);
05375 spin_unlock_irqrestore(&p->pi_lock, flags);
05376 goto recheck;
05377 }
05378 update_rq_clock(rq);
05379 on_rq = p->se.on_rq;
05380 running = task_current(rq, p);
05381 if (on_rq)
05382 deactivate_task(rq, p, 0);
05383 if (running)
05384 p->sched_class->put_prev_task(rq, p);
05385
05386 oldprio = p->prio;
05387 __setscheduler(rq, p, policy, param->sched_priority);
05388
05389 if (running)
05390 p->sched_class->set_curr_task(rq);
05391 if (on_rq) {
05392 activate_task(rq, p, 0);
05393
05394 check_class_changed(rq, p, prev_class, oldprio, running);
05395 }
05396 __task_rq_unlock(rq);
05397 spin_unlock_irqrestore(&p->pi_lock, flags);
05398
05399 rt_mutex_adjust_pi(p);
05400
05401 return 0;
05402 #else
05403
05404 return 0;
05405 #endif
05406 }
05407
05408
05409
05410
05411
05412
05413
05414
05415
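/*
 * sched_setscheduler - change the scheduling policy and/or RT priority
 * of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * NOTE that the task may be already dead.
 */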
05416 int sched_setscheduler(struct task_struct *p, int policy,
05417 struct sched_param *param)
05418 {
05419 return __sched_setscheduler(p, policy, param, true);
05420 }
05421 EXPORT_SYMBOL_GPL(sched_setscheduler);
05422
05423 #ifndef DDE_LINUX
05424
05425
05426
05427
05428
05429
05430
05431
05432
05433
05434
05435
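/*
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT
 * priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler(), only it does not check whether the
 * calling context has permission, which is what kernel-internal users
 * creating temporary high-priority threads need.
 */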
05436 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
05437 struct sched_param *param)
05438 {
05439 return __sched_setscheduler(p, policy, param, false);
05440 }
05441
05442 static int
05443 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
05444 {
05445 struct sched_param lparam;
05446 struct task_struct *p;
05447 int retval;
05448
05449 if (!param || pid < 0)
05450 return -EINVAL;
05451 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
05452 return -EFAULT;
05453
05454 rcu_read_lock();
05455 retval = -ESRCH;
05456 p = find_process_by_pid(pid);
05457 if (p != NULL)
05458 retval = sched_setscheduler(p, policy, &lparam);
05459 rcu_read_unlock();
05460
05461 return retval;
05462 }
05463
05464
05465
05466
05467
05468
05469
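/*
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 */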
05470 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
05471 struct sched_param __user *, param)
05472 {
05473
05474 if (policy < 0)
05475 return -EINVAL;
05476
05477 return do_sched_setscheduler(pid, policy, param);
05478 }
05479
05480
05481
05482
05483
05484
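/*
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 */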
05485 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
05486 {
05487 return do_sched_setscheduler(pid, -1, param);
05488 }
05489
05490
05491
05492
05493
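/*
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 */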
05494 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
05495 {
05496 struct task_struct *p;
05497 int retval;
05498
05499 if (pid < 0)
05500 return -EINVAL;
05501
05502 retval = -ESRCH;
05503 read_lock(&tasklist_lock);
05504 p = find_process_by_pid(pid);
05505 if (p) {
05506 retval = security_task_getscheduler(p);
05507 if (!retval)
05508 retval = p->policy;
05509 }
05510 read_unlock(&tasklist_lock);
05511 return retval;
05512 }
05513
05514
05515
05516
05517
05518
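/*
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 */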
05519 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
05520 {
05521 struct sched_param lp;
05522 struct task_struct *p;
05523 int retval;
05524
05525 if (!param || pid < 0)
05526 return -EINVAL;
05527
05528 read_lock(&tasklist_lock);
05529 p = find_process_by_pid(pid);
05530 retval = -ESRCH;
05531 if (!p)
05532 goto out_unlock;
05533
05534 retval = security_task_getscheduler(p);
05535 if (retval)
05536 goto out_unlock;
05537
05538 lp.sched_priority = p->rt_priority;
05539 read_unlock(&tasklist_lock);
05540
05541
05542
05543
05544 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
05545
05546 return retval;
05547
05548 out_unlock:
05549 read_unlock(&tasklist_lock);
05550 return retval;
05551 }
05552
05553 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
05554 {
05555 cpumask_var_t cpus_allowed, new_mask;
05556 struct task_struct *p;
05557 int retval;
05558
05559 get_online_cpus();
05560 read_lock(&tasklist_lock);
05561
05562 p = find_process_by_pid(pid);
05563 if (!p) {
05564 read_unlock(&tasklist_lock);
05565 put_online_cpus();
05566 return -ESRCH;
05567 }
05568
05569
05570
05571
05572
05573
05574 get_task_struct(p);
05575 read_unlock(&tasklist_lock);
05576
05577 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
05578 retval = -ENOMEM;
05579 goto out_put_task;
05580 }
05581 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
05582 retval = -ENOMEM;
05583 goto out_free_cpus_allowed;
05584 }
05585 retval = -EPERM;
05586 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
05587 goto out_unlock;
05588
05589 retval = security_task_setscheduler(p, 0, NULL);
05590 if (retval)
05591 goto out_unlock;
05592
05593 cpuset_cpus_allowed(p, cpus_allowed);
05594 cpumask_and(new_mask, in_mask, cpus_allowed);
05595 again:
05596 retval = set_cpus_allowed_ptr(p, new_mask);
05597
05598 if (!retval) {
05599 cpuset_cpus_allowed(p, cpus_allowed);
05600 if (!cpumask_subset(new_mask, cpus_allowed)) {
05601
05602
05603
05604
05605
05606 cpumask_copy(new_mask, cpus_allowed);
05607 goto again;
05608 }
05609 }
05610 out_unlock:
05611 free_cpumask_var(new_mask);
05612 out_free_cpus_allowed:
05613 free_cpumask_var(cpus_allowed);
05614 out_put_task:
05615 put_task_struct(p);
05616 put_online_cpus();
05617 return retval;
05618 }
05619
05620 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
05621 struct cpumask *new_mask)
05622 {
05623 if (len < cpumask_size())
05624 cpumask_clear(new_mask);
05625 else if (len > cpumask_size())
05626 len = cpumask_size();
05627
05628 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
05629 }
05630
05631
05632
05633
05634
05635
05636
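/*
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 */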
05637 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
05638 unsigned long __user *, user_mask_ptr)
05639 {
05640 cpumask_var_t new_mask;
05641 int retval;
05642
05643 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
05644 return -ENOMEM;
05645
05646 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
05647 if (retval == 0)
05648 retval = sched_setaffinity(pid, new_mask);
05649 free_cpumask_var(new_mask);
05650 return retval;
05651 }
05652
05653 long sched_getaffinity(pid_t pid, struct cpumask *mask)
05654 {
05655 struct task_struct *p;
05656 int retval;
05657
05658 get_online_cpus();
05659 read_lock(&tasklist_lock);
05660
05661 retval = -ESRCH;
05662 p = find_process_by_pid(pid);
05663 if (!p)
05664 goto out_unlock;
05665
05666 retval = security_task_getscheduler(p);
05667 if (retval)
05668 goto out_unlock;
05669
05670 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
05671
05672 out_unlock:
05673 read_unlock(&tasklist_lock);
05674 put_online_cpus();
05675
05676 return retval;
05677 }
05678
05679
05680
05681
05682
05683
05684
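/*
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 */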
05685 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
05686 unsigned long __user *, user_mask_ptr)
05687 {
05688 int ret;
05689 cpumask_var_t mask;
05690
05691 if (len < cpumask_size())
05692 return -EINVAL;
05693
05694 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
05695 return -ENOMEM;
05696
05697 ret = sched_getaffinity(pid, mask);
05698 if (ret == 0) {
05699 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
05700 ret = -EFAULT;
05701 else
05702 ret = cpumask_size();
05703 }
05704 free_cpumask_var(mask);
05705
05706 return ret;
05707 }
05708
05709
05710
05711
05712
05713
05714
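/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */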
05715 SYSCALL_DEFINE0(sched_yield)
05716 {
05717 struct rq *rq = this_rq_lock();
05718
05719 schedstat_inc(rq, yld_count);
05720 current->sched_class->yield_task(rq);
05721
05722
05723
05724
05725
05726 __release(rq->lock);
05727 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
05728 _raw_spin_unlock(&rq->lock);
05729 preempt_enable_no_resched();
05730
05731 schedule();
05732
05733 return 0;
05734 }
05735 #endif
05736
05737 static void __cond_resched(void)
05738 {
05739 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
05740 __might_sleep(__FILE__, __LINE__);
05741 #endif
05742
05743
05744
05745
05746
05747 do {
05748 add_preempt_count(PREEMPT_ACTIVE);
05749 schedule();
05750 sub_preempt_count(PREEMPT_ACTIVE);
05751 } while (need_resched());
05752 }
05753
05754 int __sched _cond_resched(void)
05755 {
05756 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
05757 system_state == SYSTEM_RUNNING) {
05758 __cond_resched();
05759 return 1;
05760 }
05761 return 0;
05762 }
05763 EXPORT_SYMBOL(_cond_resched);
05764
05765
05766
05767
05768
05769
05770
05771
05772
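/*
 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule(), and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. The lock is dropped
 * first so that schedule() is never called with it held.
 */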
05773 int cond_resched_lock(spinlock_t *lock)
05774 {
05775 int resched = need_resched() && system_state == SYSTEM_RUNNING;
05776 int ret = 0;
05777
05778 if (spin_needbreak(lock) || resched) {
05779 spin_unlock(lock);
05780 if (resched && need_resched())
05781 __cond_resched();
05782 else
05783 cpu_relax();
05784 ret = 1;
05785 spin_lock(lock);
05786 }
05787 return ret;
05788 }
05789 EXPORT_SYMBOL(cond_resched_lock);
05790
05791 int __sched cond_resched_softirq(void)
05792 {
05793 BUG_ON(!in_softirq());
05794
05795 if (need_resched() && system_state == SYSTEM_RUNNING) {
05796 local_bh_enable();
05797 __cond_resched();
05798 local_bh_disable();
05799 return 1;
05800 }
05801 return 0;
05802 }
05803 EXPORT_SYMBOL(cond_resched_softirq);
05804
05805 #ifndef DDE_LINUX
05806
05807
05808
05809
05810
05811
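/*
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */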
05812 void __sched yield(void)
05813 {
05814 set_current_state(TASK_RUNNING);
05815 sys_sched_yield();
05816 }
05817 EXPORT_SYMBOL(yield);
05818
05819
05820
05821
05822
05823
05824
05825
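/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */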
05826 void __sched io_schedule(void)
05827 {
05828 struct rq *rq = &__raw_get_cpu_var(runqueues);
05829
05830 delayacct_blkio_start();
05831 atomic_inc(&rq->nr_iowait);
05832 schedule();
05833 atomic_dec(&rq->nr_iowait);
05834 delayacct_blkio_end();
05835 }
05836 EXPORT_SYMBOL(io_schedule);
05837
05838 long __sched io_schedule_timeout(long timeout)
05839 {
05840 struct rq *rq = &__raw_get_cpu_var(runqueues);
05841 long ret;
05842
05843 delayacct_blkio_start();
05844 atomic_inc(&rq->nr_iowait);
05845 ret = schedule_timeout(timeout);
05846 atomic_dec(&rq->nr_iowait);
05847 delayacct_blkio_end();
05848 return ret;
05849 }
05850
05851
05852
05853
05854
05855
05856
05857
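/*
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * This syscall returns the maximum rt_priority that can be used
 * by a given scheduling class.
 */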
05858 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
05859 {
05860 int ret = -EINVAL;
05861
05862 switch (policy) {
05863 case SCHED_FIFO:
05864 case SCHED_RR:
05865 ret = MAX_USER_RT_PRIO-1;
05866 break;
05867 case SCHED_NORMAL:
05868 case SCHED_BATCH:
05869 case SCHED_IDLE:
05870 ret = 0;
05871 break;
05872 }
05873 return ret;
05874 }
05875
05876
05877
05878
05879
05880
05881
05882
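/*
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * This syscall returns the minimum rt_priority that can be used
 * by a given scheduling class.
 */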
05883 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
05884 {
05885 int ret = -EINVAL;
05886
05887 switch (policy) {
05888 case SCHED_FIFO:
05889 case SCHED_RR:
05890 ret = 1;
05891 break;
05892 case SCHED_NORMAL:
05893 case SCHED_BATCH:
05894 case SCHED_IDLE:
05895 ret = 0;
05896 }
05897 return ret;
05898 }
05899
05900
05901
05902
05903
05904
05905
05906
05907
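/*
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * This syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 */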
05908 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
05909 struct timespec __user *, interval)
05910 {
05911 struct task_struct *p;
05912 unsigned int time_slice;
05913 int retval;
05914 struct timespec t;
05915
05916 if (pid < 0)
05917 return -EINVAL;
05918
05919 retval = -ESRCH;
05920 read_lock(&tasklist_lock);
05921 p = find_process_by_pid(pid);
05922 if (!p)
05923 goto out_unlock;
05924
05925 retval = security_task_getscheduler(p);
05926 if (retval)
05927 goto out_unlock;
05928
05929
05930
05931
05932
05933 time_slice = 0;
05934 if (p->policy == SCHED_RR) {
05935 time_slice = DEF_TIMESLICE;
05936 } else if (p->policy != SCHED_FIFO) {
05937 struct sched_entity *se = &p->se;
05938 unsigned long flags;
05939 struct rq *rq;
05940
05941 rq = task_rq_lock(p, &flags);
05942 if (rq->cfs.load.weight)
05943 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
05944 task_rq_unlock(rq, &flags);
05945 }
05946 read_unlock(&tasklist_lock);
05947 jiffies_to_timespec(time_slice, &t);
05948 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
05949 return retval;
05950
05951 out_unlock:
05952 read_unlock(&tasklist_lock);
05953 return retval;
05954 }
05955
05956 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
05957
05958 void sched_show_task(struct task_struct *p)
05959 {
05960 unsigned long free = 0;
05961 unsigned state;
05962
05963 state = p->state ? __ffs(p->state) + 1 : 0;
05964 printk(KERN_INFO "%-13.13s %c", p->comm,
05965 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
05966 #if BITS_PER_LONG == 32
05967 if (state == TASK_RUNNING)
05968 printk(KERN_CONT " running ");
05969 else
05970 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
05971 #else
05972 if (state == TASK_RUNNING)
05973 printk(KERN_CONT " running task ");
05974 else
05975 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
05976 #endif
05977 #ifdef CONFIG_DEBUG_STACK_USAGE
05978 {
05979 unsigned long *n = end_of_stack(p);
05980 while (!*n)
05981 n++;
05982 free = (unsigned long)n - (unsigned long)end_of_stack(p);
05983 }
05984 #endif
05985 printk(KERN_CONT "%5lu %5d %6d\n", free,
05986 task_pid_nr(p), task_pid_nr(p->real_parent));
05987
05988 show_stack(p, NULL);
05989 }
05990
05991 void show_state_filter(unsigned long state_filter)
05992 {
05993 struct task_struct *g, *p;
05994
05995 #if BITS_PER_LONG == 32
05996 printk(KERN_INFO
05997 " task PC stack pid father\n");
05998 #else
05999 printk(KERN_INFO
06000 " task PC stack pid father\n");
06001 #endif
06002 read_lock(&tasklist_lock);
06003 do_each_thread(g, p) {
06004
06005
06006
06007
06008 touch_nmi_watchdog();
06009 if (!state_filter || (p->state & state_filter))
06010 sched_show_task(p);
06011 } while_each_thread(g, p);
06012
06013 touch_all_softlockup_watchdogs();
06014
06015 #ifdef CONFIG_SCHED_DEBUG
06016 sysrq_sched_debug_show();
06017 #endif
06018 read_unlock(&tasklist_lock);
06019
06020
06021
06022 if (state_filter == -1)
06023 debug_show_all_locks();
06024 }
06025
06026 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
06027 {
06028 idle->sched_class = &idle_sched_class;
06029 }
06030
06031
06032
06033
06034
06035
06036
06037
06038
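/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */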
06039 void __cpuinit init_idle(struct task_struct *idle, int cpu)
06040 {
06041 struct rq *rq = cpu_rq(cpu);
06042 unsigned long flags;
06043
06044 spin_lock_irqsave(&rq->lock, flags);
06045
06046 __sched_fork(idle);
06047 idle->se.exec_start = sched_clock();
06048
06049 idle->prio = idle->normal_prio = MAX_PRIO;
06050 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
06051 __set_task_cpu(idle, cpu);
06052
06053 rq->curr = rq->idle = idle;
06054 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
06055 idle->oncpu = 1;
06056 #endif
06057 spin_unlock_irqrestore(&rq->lock, flags);
06058
06059
06060 #if defined(CONFIG_PREEMPT)
06061 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
06062 #else
06063 task_thread_info(idle)->preempt_count = 0;
06064 #endif
06065
06066
06067
06068 idle->sched_class = &idle_sched_class;
06069 ftrace_graph_init_task(idle);
06070 }
06071 #endif
06072
06073
06074
06075
06076
06077
06078
06079
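/*
 * In a system that switches off the HZ timer, nohz_cpu_mask indicates
 * which cpus entered this state. This is used by the RCU update code
 * to wait only for active cpus. On systems that do not switch off the
 * HZ timer the mask stays empty.
 */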
06080 cpumask_var_t nohz_cpu_mask;
06081
06082 #ifndef DDE_LINUX
06083
06084
06085
06086
06087
06088
06089
06090
06091
06092 static inline void sched_init_granularity(void)
06093 {
06094 unsigned int factor = 1 + ilog2(num_online_cpus());
06095 const unsigned long limit = 200000000;
06096
06097 sysctl_sched_min_granularity *= factor;
06098 if (sysctl_sched_min_granularity > limit)
06099 sysctl_sched_min_granularity = limit;
06100
06101 sysctl_sched_latency *= factor;
06102 if (sysctl_sched_latency > limit)
06103 sysctl_sched_latency = limit;
06104
06105 sysctl_sched_wakeup_granularity *= factor;
06106
06107 sysctl_sched_shares_ratelimit *= factor;
06108 }
06109
06110 #ifdef CONFIG_SMP
06111
06112
06113
06114
06115
06116
06117
06118
06119
06120
06121
06122
06123
06124
06125
06126
06127
06128
06129
06130
06131
06132
06133
06134
06135
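/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask. If the task currently runs
 * on a now-disallowed CPU, a migration request is queued on that
 * CPU's runqueue and its migration thread is woken to carry the
 * task over; we then wait for the request's completion.
 *
 * NOTE: the caller must have a valid reference to the task; the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */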
06136 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
06137 {
06138 struct migration_req req;
06139 unsigned long flags;
06140 struct rq *rq;
06141 int ret = 0;
06142
06143 rq = task_rq_lock(p, &flags);
06144 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
06145 ret = -EINVAL;
06146 goto out;
06147 }
06148
06149 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
06150 !cpumask_equal(&p->cpus_allowed, new_mask))) {
06151 ret = -EINVAL;
06152 goto out;
06153 }
06154
06155 if (p->sched_class->set_cpus_allowed)
06156 p->sched_class->set_cpus_allowed(p, new_mask);
06157 else {
06158 cpumask_copy(&p->cpus_allowed, new_mask);
06159 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
06160 }
06161
06162
06163 if (cpumask_test_cpu(task_cpu(p), new_mask))
06164 goto out;
06165
06166 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
06167
06168 task_rq_unlock(rq, &flags);
06169 wake_up_process(rq->migration_thread);
06170 wait_for_completion(&req.done);
06171 tlb_migrate_finish(p->mm);
06172 return 0;
06173 }
06174 out:
06175 task_rq_unlock(rq, &flags);
06176
06177 return ret;
06178 }
06179 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
06180
06181
06182
06183
06184
06185
06186
06187
06188
06189
06190
06191
06192 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
06193 {
06194 struct rq *rq_dest, *rq_src;
06195 int ret = 0, on_rq;
06196
06197 if (unlikely(!cpu_active(dest_cpu)))
06198 return ret;
06199
06200 rq_src = cpu_rq(src_cpu);
06201 rq_dest = cpu_rq(dest_cpu);
06202
06203 double_rq_lock(rq_src, rq_dest);
06204
06205 if (task_cpu(p) != src_cpu)
06206 goto done;
06207
06208 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
06209 goto fail;
06210
06211 on_rq = p->se.on_rq;
06212 if (on_rq)
06213 deactivate_task(rq_src, p, 0);
06214
06215 set_task_cpu(p, dest_cpu);
06216 if (on_rq) {
06217 activate_task(rq_dest, p, 0);
06218 check_preempt_curr(rq_dest, p, 0);
06219 }
06220 done:
06221 ret = 1;
06222 fail:
06223 double_rq_unlock(rq_src, rq_dest);
06224 return ret;
06225 }
06226
06227
06228
06229
06230
06231
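/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping a thread off its CPU and then 'pushing'
 * it onto another runqueue.
 */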
06232 static int migration_thread(void *data)
06233 {
06234 int cpu = (long)data;
06235 struct rq *rq;
06236
06237 rq = cpu_rq(cpu);
06238 BUG_ON(rq->migration_thread != current);
06239
06240 set_current_state(TASK_INTERRUPTIBLE);
06241 while (!kthread_should_stop()) {
06242 struct migration_req *req;
06243 struct list_head *head;
06244
06245 spin_lock_irq(&rq->lock);
06246
06247 if (cpu_is_offline(cpu)) {
06248 spin_unlock_irq(&rq->lock);
06249 goto wait_to_die;
06250 }
06251
06252 if (rq->active_balance) {
06253 active_load_balance(rq, cpu);
06254 rq->active_balance = 0;
06255 }
06256
06257 head = &rq->migration_queue;
06258
06259 if (list_empty(head)) {
06260 spin_unlock_irq(&rq->lock);
06261 schedule();
06262 set_current_state(TASK_INTERRUPTIBLE);
06263 continue;
06264 }
06265 req = list_entry(head->next, struct migration_req, list);
06266 list_del_init(head->next);
06267
06268 spin_unlock(&rq->lock);
06269 __migrate_task(req->task, cpu, req->dest_cpu);
06270 local_irq_enable();
06271
06272 complete(&req->done);
06273 }
06274 __set_current_state(TASK_RUNNING);
06275 return 0;
06276
06277 wait_to_die:
06278
06279 set_current_state(TASK_INTERRUPTIBLE);
06280 while (!kthread_should_stop()) {
06281 schedule();
06282 set_current_state(TASK_INTERRUPTIBLE);
06283 }
06284 __set_current_state(TASK_RUNNING);
06285 return 0;
06286 }
06287
06288 #ifdef CONFIG_HOTPLUG_CPU
06289
06290 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
06291 {
06292 int ret;
06293
06294 local_irq_disable();
06295 ret = __migrate_task(p, src_cpu, dest_cpu);
06296 local_irq_enable();
06297 return ret;
06298 }
06299
06300
06301
06302
06303 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
06304 {
06305 int dest_cpu;
06306 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
06307
06308 again:
06309
06310 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
06311 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
06312 goto move;
06313
06314
06315 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
06316 if (dest_cpu < nr_cpu_ids)
06317 goto move;
06318
06319
06320 if (dest_cpu >= nr_cpu_ids) {
06321 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
06322 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
06323
06324
06325
06326
06327
06328
06329 if (p->mm && printk_ratelimit()) {
06330 printk(KERN_INFO "process %d (%s) no "
06331 "longer affine to cpu%d\n",
06332 task_pid_nr(p), p->comm, dead_cpu);
06333 }
06334 }
06335
06336 move:
06337
06338 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
06339 goto again;
06340 }
06341
06342
06343
06344
06345
06346
06347
06348
06349 static void migrate_nr_uninterruptible(struct rq *rq_src)
06350 {
06351 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
06352 unsigned long flags;
06353
06354 local_irq_save(flags);
06355 double_rq_lock(rq_src, rq_dest);
06356 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
06357 rq_src->nr_uninterruptible = 0;
06358 double_rq_unlock(rq_src, rq_dest);
06359 local_irq_restore(flags);
06360 }
06361
06362
06363 static void migrate_live_tasks(int src_cpu)
06364 {
06365 struct task_struct *p, *t;
06366
06367 read_lock(&tasklist_lock);
06368
06369 do_each_thread(t, p) {
06370 if (p == current)
06371 continue;
06372
06373 if (task_cpu(p) == src_cpu)
06374 move_task_off_dead_cpu(src_cpu, p);
06375 } while_each_thread(t, p);
06376
06377 read_unlock(&tasklist_lock);
06378 }
06379
06380
06381
06382
06383
06384
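/*
 * Schedule the idle task to be the next runnable task on the current CPU
 * by boosting its priority to the highest possible. Used by the CPU
 * offline code.
 */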
06385 void sched_idle_next(void)
06386 {
06387 int this_cpu = smp_processor_id();
06388 struct rq *rq = cpu_rq(this_cpu);
06389 struct task_struct *p = rq->idle;
06390 unsigned long flags;
06391
06392
06393 BUG_ON(cpu_online(this_cpu));
06394
06395
06396
06397
06398
06399 spin_lock_irqsave(&rq->lock, flags);
06400
06401 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
06402
06403 update_rq_clock(rq);
06404 activate_task(rq, p, 0);
06405
06406 spin_unlock_irqrestore(&rq->lock, flags);
06407 }
06408
06409
06410
06411
06412
06413 void idle_task_exit(void)
06414 {
06415 struct mm_struct *mm = current->active_mm;
06416
06417 BUG_ON(cpu_online(smp_processor_id()));
06418
06419 if (mm != &init_mm)
06420 switch_mm(mm, &init_mm, current);
06421 mmdrop(mm);
06422 }
06423
06424
06425 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
06426 {
06427 struct rq *rq = cpu_rq(dead_cpu);
06428
06429
06430 BUG_ON(!p->exit_state);
06431
06432
06433 BUG_ON(p->state == TASK_DEAD);
06434
06435 get_task_struct(p);
06436
06437
06438
06439
06440
06441
06442 spin_unlock_irq(&rq->lock);
06443 move_task_off_dead_cpu(dead_cpu, p);
06444 spin_lock_irq(&rq->lock);
06445
06446 put_task_struct(p);
06447 }
06448
06449
06450 static void migrate_dead_tasks(unsigned int dead_cpu)
06451 {
06452 struct rq *rq = cpu_rq(dead_cpu);
06453 struct task_struct *next;
06454
06455 for ( ; ; ) {
06456 if (!rq->nr_running)
06457 break;
06458 update_rq_clock(rq);
06459 next = pick_next_task(rq, rq->curr);
06460 if (!next)
06461 break;
06462 next->sched_class->put_prev_task(rq, next);
06463 migrate_dead(dead_cpu, next);
06464
06465 }
06466 }
06467 #endif
06468
06469 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
06470
06471 static struct ctl_table sd_ctl_dir[] = {
06472 {
06473 .procname = "sched_domain",
06474 .mode = 0555,
06475 },
06476 {0, },
06477 };
06478
06479 static struct ctl_table sd_ctl_root[] = {
06480 {
06481 .ctl_name = CTL_KERN,
06482 .procname = "kernel",
06483 .mode = 0555,
06484 .child = sd_ctl_dir,
06485 },
06486 {0, },
06487 };
06488
06489 static struct ctl_table *sd_alloc_ctl_entry(int n)
06490 {
06491 struct ctl_table *entry =
06492 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
06493
06494 return entry;
06495 }
06496
06497 static void sd_free_ctl_entry(struct ctl_table **tablep)
06498 {
06499 struct ctl_table *entry;
06500
06501
06502
06503
06504
06505
06506
06507 for (entry = *tablep; entry->mode; entry++) {
06508 if (entry->child)
06509 sd_free_ctl_entry(&entry->child);
06510 if (entry->proc_handler == NULL)
06511 kfree(entry->procname);
06512 }
06513
06514 kfree(*tablep);
06515 *tablep = NULL;
06516 }
06517
06518 static void
06519 set_table_entry(struct ctl_table *entry,
06520 const char *procname, void *data, int maxlen,
06521 mode_t mode, proc_handler *proc_handler)
06522 {
06523 entry->procname = procname;
06524 entry->data = data;
06525 entry->maxlen = maxlen;
06526 entry->mode = mode;
06527 entry->proc_handler = proc_handler;
06528 }
06529
06530 static struct ctl_table *
06531 sd_alloc_ctl_domain_table(struct sched_domain *sd)
06532 {
06533 struct ctl_table *table = sd_alloc_ctl_entry(13);
06534
06535 if (table == NULL)
06536 return NULL;
06537
06538 set_table_entry(&table[0], "min_interval", &sd->min_interval,
06539 sizeof(long), 0644, proc_doulongvec_minmax);
06540 set_table_entry(&table[1], "max_interval", &sd->max_interval,
06541 sizeof(long), 0644, proc_doulongvec_minmax);
06542 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
06543 sizeof(int), 0644, proc_dointvec_minmax);
06544 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
06545 sizeof(int), 0644, proc_dointvec_minmax);
06546 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
06547 sizeof(int), 0644, proc_dointvec_minmax);
06548 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
06549 sizeof(int), 0644, proc_dointvec_minmax);
06550 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
06551 sizeof(int), 0644, proc_dointvec_minmax);
06552 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
06553 sizeof(int), 0644, proc_dointvec_minmax);
06554 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
06555 sizeof(int), 0644, proc_dointvec_minmax);
06556 set_table_entry(&table[9], "cache_nice_tries",
06557 &sd->cache_nice_tries,
06558 sizeof(int), 0644, proc_dointvec_minmax);
06559 set_table_entry(&table[10], "flags", &sd->flags,
06560 sizeof(int), 0644, proc_dointvec_minmax);
06561 set_table_entry(&table[11], "name", sd->name,
06562 CORENAME_MAX_SIZE, 0444, proc_dostring);
06563
06564
06565 return table;
06566 }
06567
06568 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
06569 {
06570 struct ctl_table *entry, *table;
06571 struct sched_domain *sd;
06572 int domain_num = 0, i;
06573 char buf[32];
06574
06575 for_each_domain(cpu, sd)
06576 domain_num++;
06577 entry = table = sd_alloc_ctl_entry(domain_num + 1);
06578 if (table == NULL)
06579 return NULL;
06580
06581 i = 0;
06582 for_each_domain(cpu, sd) {
06583 snprintf(buf, 32, "domain%d", i);
06584 entry->procname = kstrdup(buf, GFP_KERNEL);
06585 entry->mode = 0555;
06586 entry->child = sd_alloc_ctl_domain_table(sd);
06587 entry++;
06588 i++;
06589 }
06590 return table;
06591 }
06592
06593 static struct ctl_table_header *sd_sysctl_header;
06594 static void register_sched_domain_sysctl(void)
06595 {
06596 int i, cpu_num = num_online_cpus();
06597 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
06598 char buf[32];
06599
06600 WARN_ON(sd_ctl_dir[0].child);
06601 sd_ctl_dir[0].child = entry;
06602
06603 if (entry == NULL)
06604 return;
06605
06606 for_each_online_cpu(i) {
06607 snprintf(buf, 32, "cpu%d", i);
06608 entry->procname = kstrdup(buf, GFP_KERNEL);
06609 entry->mode = 0555;
06610 entry->child = sd_alloc_ctl_cpu_table(i);
06611 entry++;
06612 }
06613
06614 WARN_ON(sd_sysctl_header);
06615 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
06616 }
06617
06618
06619 static void unregister_sched_domain_sysctl(void)
06620 {
06621 if (sd_sysctl_header)
06622 unregister_sysctl_table(sd_sysctl_header);
06623 sd_sysctl_header = NULL;
06624 if (sd_ctl_dir[0].child)
06625 sd_free_ctl_entry(&sd_ctl_dir[0].child);
06626 }
06627 #else
06628 static void register_sched_domain_sysctl(void)
06629 {
06630 }
06631 static void unregister_sched_domain_sysctl(void)
06632 {
06633 }
06634 #endif
06635
06636 static void set_rq_online(struct rq *rq)
06637 {
06638 if (!rq->online) {
06639 const struct sched_class *class;
06640
06641 cpumask_set_cpu(rq->cpu, rq->rd->online);
06642 rq->online = 1;
06643
06644 for_each_class(class) {
06645 if (class->rq_online)
06646 class->rq_online(rq);
06647 }
06648 }
06649 }
06650
06651 static void set_rq_offline(struct rq *rq)
06652 {
06653 if (rq->online) {
06654 const struct sched_class *class;
06655
06656 for_each_class(class) {
06657 if (class->rq_offline)
06658 class->rq_offline(rq);
06659 }
06660
06661 cpumask_clear_cpu(rq->cpu, rq->rd->online);
06662 rq->online = 0;
06663 }
06664 }
06665
06666
06667
06668
06669
06670 static int __cpuinit
06671 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
06672 {
06673 struct task_struct *p;
06674 int cpu = (long)hcpu;
06675 unsigned long flags;
06676 struct rq *rq;
06677
06678 switch (action) {
06679
06680 case CPU_UP_PREPARE:
06681 case CPU_UP_PREPARE_FROZEN:
06682 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
06683 if (IS_ERR(p))
06684 return NOTIFY_BAD;
06685 kthread_bind(p, cpu);
06686
06687 rq = task_rq_lock(p, &flags);
06688 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
06689 task_rq_unlock(rq, &flags);
06690 cpu_rq(cpu)->migration_thread = p;
06691 break;
06692
06693 case CPU_ONLINE:
06694 case CPU_ONLINE_FROZEN:
06695
06696 wake_up_process(cpu_rq(cpu)->migration_thread);
06697
06698
06699 rq = cpu_rq(cpu);
06700 spin_lock_irqsave(&rq->lock, flags);
06701 if (rq->rd) {
06702 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
06703
06704 set_rq_online(rq);
06705 }
06706 spin_unlock_irqrestore(&rq->lock, flags);
06707 break;
06708
06709 #ifdef CONFIG_HOTPLUG_CPU
06710 case CPU_UP_CANCELED:
06711 case CPU_UP_CANCELED_FROZEN:
06712 if (!cpu_rq(cpu)->migration_thread)
06713 break;
06714
06715 kthread_bind(cpu_rq(cpu)->migration_thread,
06716 cpumask_any(cpu_online_mask));
06717 kthread_stop(cpu_rq(cpu)->migration_thread);
06718 cpu_rq(cpu)->migration_thread = NULL;
06719 break;
06720
06721 case CPU_DEAD:
06722 case CPU_DEAD_FROZEN:
06723 cpuset_lock();
06724 migrate_live_tasks(cpu);
06725 rq = cpu_rq(cpu);
06726 kthread_stop(rq->migration_thread);
06727 rq->migration_thread = NULL;
06728
06729 spin_lock_irq(&rq->lock);
06730 update_rq_clock(rq);
06731 deactivate_task(rq, rq->idle, 0);
06732 rq->idle->static_prio = MAX_PRIO;
06733 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
06734 rq->idle->sched_class = &idle_sched_class;
06735 migrate_dead_tasks(cpu);
06736 spin_unlock_irq(&rq->lock);
06737 cpuset_unlock();
06738 migrate_nr_uninterruptible(rq);
06739 BUG_ON(rq->nr_running != 0);
06740
06741
06742
06743
06744
06745
06746 spin_lock_irq(&rq->lock);
06747 while (!list_empty(&rq->migration_queue)) {
06748 struct migration_req *req;
06749
06750 req = list_entry(rq->migration_queue.next,
06751 struct migration_req, list);
06752 list_del_init(&req->list);
06753 spin_unlock_irq(&rq->lock);
06754 complete(&req->done);
06755 spin_lock_irq(&rq->lock);
06756 }
06757 spin_unlock_irq(&rq->lock);
06758 break;
06759
06760 case CPU_DYING:
06761 case CPU_DYING_FROZEN:
06762
06763 rq = cpu_rq(cpu);
06764 spin_lock_irqsave(&rq->lock, flags);
06765 if (rq->rd) {
06766 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
06767 set_rq_offline(rq);
06768 }
06769 spin_unlock_irqrestore(&rq->lock, flags);
06770 break;
06771 #endif
06772 }
06773 return NOTIFY_OK;
06774 }
06775
06776
06777
06778
06779 static struct notifier_block __cpuinitdata migration_notifier = {
06780 .notifier_call = migration_call,
06781 .priority = 10
06782 };
06783
06784 static int __init migration_init(void)
06785 {
06786 void *cpu = (void *)(long)smp_processor_id();
06787 int err;
06788
06789
06790 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
06791 BUG_ON(err == NOTIFY_BAD);
06792 migration_call(&migration_notifier, CPU_ONLINE, cpu);
06793 register_cpu_notifier(&migration_notifier);
06794
06795 return err;
06796 }
06797 early_initcall(migration_init);
06798 #endif
06799
06800 #ifdef CONFIG_SMP
06801
06802 #ifdef CONFIG_SCHED_DEBUG
06803
06804 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
06805 struct cpumask *groupmask)
06806 {
06807 struct sched_group *group = sd->groups;
06808 char str[256];
06809
06810 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
06811 cpumask_clear(groupmask);
06812
06813 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
06814
06815 if (!(sd->flags & SD_LOAD_BALANCE)) {
06816 printk("does not load-balance\n");
06817 if (sd->parent)
06818 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
06819 " has parent");
06820 return -1;
06821 }
06822
06823 printk(KERN_CONT "span %s level %s\n", str, sd->name);
06824
06825 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
06826 printk(KERN_ERR "ERROR: domain->span does not contain "
06827 "CPU%d\n", cpu);
06828 }
06829 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
06830 printk(KERN_ERR "ERROR: domain->groups does not contain"
06831 " CPU%d\n", cpu);
06832 }
06833
06834 printk(KERN_DEBUG "%*s groups:", level + 1, "");
06835 do {
06836 if (!group) {
06837 printk("\n");
06838 printk(KERN_ERR "ERROR: group is NULL\n");
06839 break;
06840 }
06841
06842 if (!group->__cpu_power) {
06843 printk(KERN_CONT "\n");
06844 printk(KERN_ERR "ERROR: domain->cpu_power not "
06845 "set\n");
06846 break;
06847 }
06848
06849 if (!cpumask_weight(sched_group_cpus(group))) {
06850 printk(KERN_CONT "\n");
06851 printk(KERN_ERR "ERROR: empty group\n");
06852 break;
06853 }
06854
06855 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
06856 printk(KERN_CONT "\n");
06857 printk(KERN_ERR "ERROR: repeated CPUs\n");
06858 break;
06859 }
06860
06861 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
06862
06863 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
06864 printk(KERN_CONT " %s", str);
06865
06866 group = group->next;
06867 } while (group != sd->groups);
06868 printk(KERN_CONT "\n");
06869
06870 if (!cpumask_equal(sched_domain_span(sd), groupmask))
06871 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
06872
06873 if (sd->parent &&
06874 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
06875 printk(KERN_ERR "ERROR: parent span is not a superset "
06876 "of domain->span\n");
06877 return 0;
06878 }
06879
06880 static void sched_domain_debug(struct sched_domain *sd, int cpu)
06881 {
06882 cpumask_var_t groupmask;
06883 int level = 0;
06884
06885 if (!sd) {
06886 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
06887 return;
06888 }
06889
06890 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
06891
06892 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
06893 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
06894 return;
06895 }
06896
06897 for (;;) {
06898 if (sched_domain_debug_one(sd, cpu, level, groupmask))
06899 break;
06900 level++;
06901 sd = sd->parent;
06902 if (!sd)
06903 break;
06904 }
06905 free_cpumask_var(groupmask);
06906 }
06907 #else
06908 # define sched_domain_debug(sd, cpu) do { } while (0)
06909 #endif
06910
06911 static int sd_degenerate(struct sched_domain *sd)
06912 {
06913 if (cpumask_weight(sched_domain_span(sd)) == 1)
06914 return 1;
06915
06916
06917 if (sd->flags & (SD_LOAD_BALANCE |
06918 SD_BALANCE_NEWIDLE |
06919 SD_BALANCE_FORK |
06920 SD_BALANCE_EXEC |
06921 SD_SHARE_CPUPOWER |
06922 SD_SHARE_PKG_RESOURCES)) {
06923 if (sd->groups != sd->groups->next)
06924 return 0;
06925 }
06926
06927
06928 if (sd->flags & (SD_WAKE_IDLE |
06929 SD_WAKE_AFFINE |
06930 SD_WAKE_BALANCE))
06931 return 0;
06932
06933 return 1;
06934 }
06935
06936 static int
06937 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
06938 {
06939 unsigned long cflags = sd->flags, pflags = parent->flags;
06940
06941 if (sd_degenerate(parent))
06942 return 1;
06943
06944 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
06945 return 0;
06946
06947
06948
06949 if (cflags & SD_WAKE_AFFINE)
06950 pflags &= ~SD_WAKE_BALANCE;
06951
06952 if (parent->groups == parent->groups->next) {
06953 pflags &= ~(SD_LOAD_BALANCE |
06954 SD_BALANCE_NEWIDLE |
06955 SD_BALANCE_FORK |
06956 SD_BALANCE_EXEC |
06957 SD_SHARE_CPUPOWER |
06958 SD_SHARE_PKG_RESOURCES);
06959 if (nr_node_ids == 1)
06960 pflags &= ~SD_SERIALIZE;
06961 }
06962 if (~cflags & pflags)
06963 return 0;
06964
06965 return 1;
06966 }
06967
06968 static void free_rootdomain(struct root_domain *rd)
06969 {
06970 cpupri_cleanup(&rd->cpupri);
06971
06972 free_cpumask_var(rd->rto_mask);
06973 free_cpumask_var(rd->online);
06974 free_cpumask_var(rd->span);
06975 kfree(rd);
06976 }
06977
06978 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
06979 {
06980 struct root_domain *old_rd = NULL;
06981 unsigned long flags;
06982
06983 spin_lock_irqsave(&rq->lock, flags);
06984
06985 if (rq->rd) {
06986 old_rd = rq->rd;
06987
06988 if (cpumask_test_cpu(rq->cpu, old_rd->online))
06989 set_rq_offline(rq);
06990
06991 cpumask_clear_cpu(rq->cpu, old_rd->span);
06992
06993
06994
06995
06996
06997
06998 if (!atomic_dec_and_test(&old_rd->refcount))
06999 old_rd = NULL;
07000 }
07001
07002 atomic_inc(&rd->refcount);
07003 rq->rd = rd;
07004
07005 cpumask_set_cpu(rq->cpu, rd->span);
07006 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
07007 set_rq_online(rq);
07008
07009 spin_unlock_irqrestore(&rq->lock, flags);
07010
07011 if (old_rd)
07012 free_rootdomain(old_rd);
07013 }
07014
07015 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
07016 {
07017 memset(rd, 0, sizeof(*rd));
07018
07019 if (bootmem) {
07020 alloc_bootmem_cpumask_var(&def_root_domain.span);
07021 alloc_bootmem_cpumask_var(&def_root_domain.online);
07022 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
07023 cpupri_init(&rd->cpupri, true);
07024 return 0;
07025 }
07026
07027 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
07028 goto out;
07029 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
07030 goto free_span;
07031 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
07032 goto free_online;
07033
07034 if (cpupri_init(&rd->cpupri, false) != 0)
07035 goto free_rto_mask;
07036 return 0;
07037
07038 free_rto_mask:
07039 free_cpumask_var(rd->rto_mask);
07040 free_online:
07041 free_cpumask_var(rd->online);
07042 free_span:
07043 free_cpumask_var(rd->span);
07044 out:
07045 return -ENOMEM;
07046 }
07047
07048 static void init_defrootdomain(void)
07049 {
07050 init_rootdomain(&def_root_domain, true);
07051
07052 atomic_set(&def_root_domain.refcount, 1);
07053 }
07054
07055 static struct root_domain *alloc_rootdomain(void)
07056 {
07057 struct root_domain *rd;
07058
07059 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
07060 if (!rd)
07061 return NULL;
07062
07063 if (init_rootdomain(rd, false) != 0) {
07064 kfree(rd);
07065 return NULL;
07066 }
07067
07068 return rd;
07069 }
07070
07071
07072
07073
07074
07075 static void
07076 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
07077 {
07078 struct rq *rq = cpu_rq(cpu);
07079 struct sched_domain *tmp;
07080
07081
07082 for (tmp = sd; tmp; ) {
07083 struct sched_domain *parent = tmp->parent;
07084 if (!parent)
07085 break;
07086
07087 if (sd_parent_degenerate(tmp, parent)) {
07088 tmp->parent = parent->parent;
07089 if (parent->parent)
07090 parent->parent->child = tmp;
07091 } else
07092 tmp = tmp->parent;
07093 }
07094
07095 if (sd && sd_degenerate(sd)) {
07096 sd = sd->parent;
07097 if (sd)
07098 sd->child = NULL;
07099 }
07100
07101 sched_domain_debug(sd, cpu);
07102
07103 rq_attach_root(rq, rd);
07104 rcu_assign_pointer(rq->sd, sd);
07105 }
07106
07107
07108 static cpumask_var_t cpu_isolated_map;
07109
07110
07111 static int __init isolated_cpu_setup(char *str)
07112 {
07113 cpulist_parse(str, cpu_isolated_map);
07114 return 1;
07115 }
07116
07117 __setup("isolcpus=", isolated_cpu_setup);
07118
07119
07120
07121
07122
07123
07124
07125
07126
07127
07128
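/*
 * init_sched_build_groups takes the span we wish to build groups
 * for and a pointer to a function which identifies what group (via a
 * returned group index and a filled-in *sg pointer) a given CPU belongs to.
 *
 * It builds a circular linked list of the groups covered by the span,
 * sets each group's cpumask correctly, and initializes ->__cpu_power
 * to zero.
 */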
07129 static void
07130 init_sched_build_groups(const struct cpumask *span,
07131 const struct cpumask *cpu_map,
07132 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
07133 struct sched_group **sg,
07134 struct cpumask *tmpmask),
07135 struct cpumask *covered, struct cpumask *tmpmask)
07136 {
07137 struct sched_group *first = NULL, *last = NULL;
07138 int i;
07139
07140 cpumask_clear(covered);
07141
07142 for_each_cpu(i, span) {
07143 struct sched_group *sg;
07144 int group = group_fn(i, cpu_map, &sg, tmpmask);
07145 int j;
07146
07147 if (cpumask_test_cpu(i, covered))
07148 continue;
07149
07150 cpumask_clear(sched_group_cpus(sg));
07151 sg->__cpu_power = 0;
07152
07153 for_each_cpu(j, span) {
07154 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
07155 continue;
07156
07157 cpumask_set_cpu(j, covered);
07158 cpumask_set_cpu(j, sched_group_cpus(sg));
07159 }
07160 if (!first)
07161 first = sg;
07162 if (last)
07163 last->next = sg;
07164 last = sg;
07165 }
07166 last->next = first;
07167 }
07168
07169 #define SD_NODES_PER_DOMAIN 16
07170
07171 #ifdef CONFIG_NUMA
07172
07173
07174
07175
07176
07177
07178
07179
07180
07181
07182
07183 static int find_next_best_node(int node, nodemask_t *used_nodes)
07184 {
07185 int i, n, val, min_val, best_node = 0;
07186
07187 min_val = INT_MAX;
07188
07189 for (i = 0; i < nr_node_ids; i++) {
07190
07191 n = (node + i) % nr_node_ids;
07192
07193 if (!nr_cpus_node(n))
07194 continue;
07195
07196
07197 if (node_isset(n, *used_nodes))
07198 continue;
07199
07200
07201 val = node_distance(node, n);
07202
07203 if (val < min_val) {
07204 min_val = val;
07205 best_node = n;
07206 }
07207 }
07208
07209 node_set(best_node, *used_nodes);
07210 return best_node;
07211 }
07212
07213
07214
07215
07216
07217
07218
07219
07220
07221
07222 static void sched_domain_node_span(int node, struct cpumask *span)
07223 {
07224 nodemask_t used_nodes;
07225 int i;
07226
07227 cpumask_clear(span);
07228 nodes_clear(used_nodes);
07229
07230 cpumask_or(span, span, cpumask_of_node(node));
07231 node_set(node, used_nodes);
07232
07233 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
07234 int next_node = find_next_best_node(node, &used_nodes);
07235
07236 cpumask_or(span, span, cpumask_of_node(next_node));
07237 }
07238 }
07239 #endif
07240
07241 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
07242
07243
07244
07245
07246
07247
07248 struct static_sched_group {
07249 struct sched_group sg;
07250 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
07251 };
07252
07253 struct static_sched_domain {
07254 struct sched_domain sd;
07255 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
07256 };
07257
07258
07259
07260
07261 #ifdef CONFIG_SCHED_SMT
07262 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
07263 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
07264
07265 static int
07266 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
07267 struct sched_group **sg, struct cpumask *unused)
07268 {
07269 if (sg)
07270 *sg = &per_cpu(sched_group_cpus, cpu).sg;
07271 return cpu;
07272 }
07273 #endif
07274
07275
07276
07277
07278 #ifdef CONFIG_SCHED_MC
07279 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
07280 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
07281 #endif
07282
07283 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
07284 static int
07285 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
07286 struct sched_group **sg, struct cpumask *mask)
07287 {
07288 int group;
07289
07290 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
07291 group = cpumask_first(mask);
07292 if (sg)
07293 *sg = &per_cpu(sched_group_core, group).sg;
07294 return group;
07295 }
07296 #elif defined(CONFIG_SCHED_MC)
07297 static int
07298 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
07299 struct sched_group **sg, struct cpumask *unused)
07300 {
07301 if (sg)
07302 *sg = &per_cpu(sched_group_core, cpu).sg;
07303 return cpu;
07304 }
07305 #endif
07306
07307 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
07308 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
07309
07310 static int
07311 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
07312 struct sched_group **sg, struct cpumask *mask)
07313 {
07314 int group;
07315 #ifdef CONFIG_SCHED_MC
07316 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
07317 group = cpumask_first(mask);
07318 #elif defined(CONFIG_SCHED_SMT)
07319 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
07320 group = cpumask_first(mask);
07321 #else
07322 group = cpu;
07323 #endif
07324 if (sg)
07325 *sg = &per_cpu(sched_group_phys, group).sg;
07326 return group;
07327 }
07328
07329 #ifdef CONFIG_NUMA
07330
07331
07332
07333
07334
07335 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
07336 static struct sched_group ***sched_group_nodes_bycpu;
07337
07338 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
07339 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
07340
07341 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
07342 struct sched_group **sg,
07343 struct cpumask *nodemask)
07344 {
07345 int group;
07346
07347 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
07348 group = cpumask_first(nodemask);
07349
07350 if (sg)
07351 *sg = &per_cpu(sched_group_allnodes, group).sg;
07352 return group;
07353 }
07354
07355 static void init_numa_sched_groups_power(struct sched_group *group_head)
07356 {
07357 struct sched_group *sg = group_head;
07358 int j;
07359
07360 if (!sg)
07361 return;
07362 do {
07363 for_each_cpu(j, sched_group_cpus(sg)) {
07364 struct sched_domain *sd;
07365
07366 sd = &per_cpu(phys_domains, j).sd;
07367 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
07368
07369
07370
07371
07372 continue;
07373 }
07374
07375 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
07376 }
07377 sg = sg->next;
07378 } while (sg != group_head);
07379 }
07380 #endif
07381
07382 #ifdef CONFIG_NUMA
07383
07384 static void free_sched_groups(const struct cpumask *cpu_map,
07385 struct cpumask *nodemask)
07386 {
07387 int cpu, i;
07388
07389 for_each_cpu(cpu, cpu_map) {
07390 struct sched_group **sched_group_nodes
07391 = sched_group_nodes_bycpu[cpu];
07392
07393 if (!sched_group_nodes)
07394 continue;
07395
07396 for (i = 0; i < nr_node_ids; i++) {
07397 struct sched_group *oldsg, *sg = sched_group_nodes[i];
07398
07399 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
07400 if (cpumask_empty(nodemask))
07401 continue;
07402
07403 if (sg == NULL)
07404 continue;
07405 sg = sg->next;
07406 next_sg:
07407 oldsg = sg;
07408 sg = sg->next;
07409 kfree(oldsg);
07410 if (oldsg != sched_group_nodes[i])
07411 goto next_sg;
07412 }
07413 kfree(sched_group_nodes);
07414 sched_group_nodes_bycpu[cpu] = NULL;
07415 }
07416 }
07417 #else
07418 static void free_sched_groups(const struct cpumask *cpu_map,
07419 struct cpumask *nodemask)
07420 {
07421 }
07422 #endif
07423
07424
07425
07426
07427
07428
07429
07430
07431
07432
07433
07434
07435
07436
07437
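/*
 * Initialize sched groups' cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing load between sched groups in a sched domain. Typically
 * cpu_power is the same for all groups in a domain unless the topology
 * is asymmetric; a group with more cpu_power picks up more load.
 */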
07438 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
07439 {
07440 struct sched_domain *child;
07441 struct sched_group *group;
07442
07443 WARN_ON(!sd || !sd->groups);
07444
07445 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
07446 return;
07447
07448 child = sd->child;
07449
07450 sd->groups->__cpu_power = 0;
07451
07452
07453
07454
07455
07456
07457
07458
07459 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
07460 (child->flags &
07461 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
07462 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
07463 return;
07464 }
07465
07466
07467
07468
07469 group = child->groups;
07470 do {
07471 sg_inc_cpu_power(sd->groups, group->__cpu_power);
07472 group = group->next;
07473 } while (group != child->groups);
07474 }
07475
07476
07477
07478
07479
07480
07481 #ifdef CONFIG_SCHED_DEBUG
07482 # define SD_INIT_NAME(sd, type) sd->name = #type
07483 #else
07484 # define SD_INIT_NAME(sd, type) do { } while (0)
07485 #endif
07486
07487 #define SD_INIT(sd, type) sd_init_##type(sd)
07488
07489 #define SD_INIT_FUNC(type) \
07490 static noinline void sd_init_##type(struct sched_domain *sd) \
07491 { \
07492 memset(sd, 0, sizeof(*sd)); \
07493 *sd = SD_##type##_INIT; \
07494 sd->level = SD_LV_##type; \
07495 SD_INIT_NAME(sd, type); \
07496 }
07497
07498 SD_INIT_FUNC(CPU)
07499 #ifdef CONFIG_NUMA
07500 SD_INIT_FUNC(ALLNODES)
07501 SD_INIT_FUNC(NODE)
07502 #endif
07503 #ifdef CONFIG_SCHED_SMT
07504 SD_INIT_FUNC(SIBLING)
07505 #endif
07506 #ifdef CONFIG_SCHED_MC
07507 SD_INIT_FUNC(MC)
07508 #endif
07509
07510 static int default_relax_domain_level = -1;
07511
07512 static int __init setup_relax_domain_level(char *str)
07513 {
07514 unsigned long val;
07515
07516 val = simple_strtoul(str, NULL, 0);
07517 if (val < SD_LV_MAX)
07518 default_relax_domain_level = val;
07519
07520 return 1;
07521 }
07522 __setup("relax_domain_level=", setup_relax_domain_level);
07523
07524 static void set_domain_attribute(struct sched_domain *sd,
07525 struct sched_domain_attr *attr)
07526 {
07527 int request;
07528
07529 if (!attr || attr->relax_domain_level < 0) {
07530 if (default_relax_domain_level < 0)
07531 return;
07532 else
07533 request = default_relax_domain_level;
07534 } else
07535 request = attr->relax_domain_level;
07536 if (request < sd->level) {
07537
07538 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
07539 } else {
07540
07541 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
07542 }
07543 }
07544
07545
07546
07547
07548
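/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus.
 */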
07549 static int __build_sched_domains(const struct cpumask *cpu_map,
07550 struct sched_domain_attr *attr)
07551 {
07552 int i, err = -ENOMEM;
07553 struct root_domain *rd;
07554 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
07555 tmpmask;
07556 #ifdef CONFIG_NUMA
07557 cpumask_var_t domainspan, covered, notcovered;
07558 struct sched_group **sched_group_nodes = NULL;
07559 int sd_allnodes = 0;
07560
07561 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
07562 goto out;
07563 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
07564 goto free_domainspan;
07565 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
07566 goto free_covered;
07567 #endif
07568
07569 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
07570 goto free_notcovered;
07571 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
07572 goto free_nodemask;
07573 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
07574 goto free_this_sibling_map;
07575 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
07576 goto free_this_core_map;
07577 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
07578 goto free_send_covered;
07579
07580 #ifdef CONFIG_NUMA
07581
07582
07583
07584 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
07585 GFP_KERNEL);
07586 if (!sched_group_nodes) {
07587 printk(KERN_WARNING "Can not alloc sched group node list\n");
07588 goto free_tmpmask;
07589 }
07590 #endif
07591
07592 rd = alloc_rootdomain();
07593 if (!rd) {
07594 printk(KERN_WARNING "Cannot alloc root domain\n");
07595 goto free_sched_groups;
07596 }
07597
07598 #ifdef CONFIG_NUMA
07599 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
07600 #endif
07601
07602
07603
07604
07605 for_each_cpu(i, cpu_map) {
07606 struct sched_domain *sd = NULL, *p;
07607
07608 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
07609
07610 #ifdef CONFIG_NUMA
07611 if (cpumask_weight(cpu_map) >
07612 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
07613 sd = &per_cpu(allnodes_domains, i).sd;
07614 SD_INIT(sd, ALLNODES);
07615 set_domain_attribute(sd, attr);
07616 cpumask_copy(sched_domain_span(sd), cpu_map);
07617 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
07618 p = sd;
07619 sd_allnodes = 1;
07620 } else
07621 p = NULL;
07622
07623 sd = &per_cpu(node_domains, i).sd;
07624 SD_INIT(sd, NODE);
07625 set_domain_attribute(sd, attr);
07626 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
07627 sd->parent = p;
07628 if (p)
07629 p->child = sd;
07630 cpumask_and(sched_domain_span(sd),
07631 sched_domain_span(sd), cpu_map);
07632 #endif
07633
07634 p = sd;
07635 sd = &per_cpu(phys_domains, i).sd;
07636 SD_INIT(sd, CPU);
07637 set_domain_attribute(sd, attr);
07638 cpumask_copy(sched_domain_span(sd), nodemask);
07639 sd->parent = p;
07640 if (p)
07641 p->child = sd;
07642 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
07643
07644 #ifdef CONFIG_SCHED_MC
07645 p = sd;
07646 sd = &per_cpu(core_domains, i).sd;
07647 SD_INIT(sd, MC);
07648 set_domain_attribute(sd, attr);
07649 cpumask_and(sched_domain_span(sd), cpu_map,
07650 cpu_coregroup_mask(i));
07651 sd->parent = p;
07652 p->child = sd;
07653 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
07654 #endif
07655
07656 #ifdef CONFIG_SCHED_SMT
07657 p = sd;
07658 sd = &per_cpu(cpu_domains, i).sd;
07659 SD_INIT(sd, SIBLING);
07660 set_domain_attribute(sd, attr);
07661 cpumask_and(sched_domain_span(sd),
07662 &per_cpu(cpu_sibling_map, i), cpu_map);
07663 sd->parent = p;
07664 p->child = sd;
07665 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
07666 #endif
07667 }
07668
07669 #ifdef CONFIG_SCHED_SMT
07670
07671 for_each_cpu(i, cpu_map) {
07672 cpumask_and(this_sibling_map,
07673 &per_cpu(cpu_sibling_map, i), cpu_map);
07674 if (i != cpumask_first(this_sibling_map))
07675 continue;
07676
07677 init_sched_build_groups(this_sibling_map, cpu_map,
07678 &cpu_to_cpu_group,
07679 send_covered, tmpmask);
07680 }
07681 #endif
07682
07683 #ifdef CONFIG_SCHED_MC
07684
07685 for_each_cpu(i, cpu_map) {
07686 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
07687 if (i != cpumask_first(this_core_map))
07688 continue;
07689
07690 init_sched_build_groups(this_core_map, cpu_map,
07691 &cpu_to_core_group,
07692 send_covered, tmpmask);
07693 }
07694 #endif
07695
07696
07697 for (i = 0; i < nr_node_ids; i++) {
07698 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
07699 if (cpumask_empty(nodemask))
07700 continue;
07701
07702 init_sched_build_groups(nodemask, cpu_map,
07703 &cpu_to_phys_group,
07704 send_covered, tmpmask);
07705 }
07706
07707 #ifdef CONFIG_NUMA
07708
07709 if (sd_allnodes) {
07710 init_sched_build_groups(cpu_map, cpu_map,
07711 &cpu_to_allnodes_group,
07712 send_covered, tmpmask);
07713 }
07714
07715 for (i = 0; i < nr_node_ids; i++) {
07716
07717 struct sched_group *sg, *prev;
07718 int j;
07719
07720 cpumask_clear(covered);
07721 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
07722 if (cpumask_empty(nodemask)) {
07723 sched_group_nodes[i] = NULL;
07724 continue;
07725 }
07726
07727 sched_domain_node_span(i, domainspan);
07728 cpumask_and(domainspan, domainspan, cpu_map);
07729
07730 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
07731 GFP_KERNEL, i);
07732 if (!sg) {
07733 printk(KERN_WARNING "Can not alloc domain group for "
07734 "node %d\n", i);
07735 goto error;
07736 }
07737 sched_group_nodes[i] = sg;
07738 for_each_cpu(j, nodemask) {
07739 struct sched_domain *sd;
07740
07741 sd = &per_cpu(node_domains, j).sd;
07742 sd->groups = sg;
07743 }
07744 sg->__cpu_power = 0;
07745 cpumask_copy(sched_group_cpus(sg), nodemask);
07746 sg->next = sg;
07747 cpumask_or(covered, covered, nodemask);
07748 prev = sg;
07749
07750 for (j = 0; j < nr_node_ids; j++) {
07751 int n = (i + j) % nr_node_ids;
07752
07753 cpumask_complement(notcovered, covered);
07754 cpumask_and(tmpmask, notcovered, cpu_map);
07755 cpumask_and(tmpmask, tmpmask, domainspan);
07756 if (cpumask_empty(tmpmask))
07757 break;
07758
07759 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
07760 if (cpumask_empty(tmpmask))
07761 continue;
07762
07763 sg = kmalloc_node(sizeof(struct sched_group) +
07764 cpumask_size(),
07765 GFP_KERNEL, i);
07766 if (!sg) {
07767 printk(KERN_WARNING
07768 "Can not alloc domain group for node %d\n", j);
07769 goto error;
07770 }
07771 sg->__cpu_power = 0;
07772 cpumask_copy(sched_group_cpus(sg), tmpmask);
07773 sg->next = prev->next;
07774 cpumask_or(covered, covered, tmpmask);
07775 prev->next = sg;
07776 prev = sg;
07777 }
07778 }
07779 #endif
07780
07781
07782 #ifdef CONFIG_SCHED_SMT
07783 for_each_cpu(i, cpu_map) {
07784 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
07785
07786 init_sched_groups_power(i, sd);
07787 }
07788 #endif
07789 #ifdef CONFIG_SCHED_MC
07790 for_each_cpu(i, cpu_map) {
07791 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
07792
07793 init_sched_groups_power(i, sd);
07794 }
07795 #endif
07796
07797 for_each_cpu(i, cpu_map) {
07798 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
07799
07800 init_sched_groups_power(i, sd);
07801 }
07802
07803 #ifdef CONFIG_NUMA
07804 for (i = 0; i < nr_node_ids; i++)
07805 init_numa_sched_groups_power(sched_group_nodes[i]);
07806
07807 if (sd_allnodes) {
07808 struct sched_group *sg;
07809
07810 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
07811 tmpmask);
07812 init_numa_sched_groups_power(sg);
07813 }
07814 #endif
07815
07816 /* Attach the domains */
07817 for_each_cpu(i, cpu_map) {
07818 struct sched_domain *sd;
07819 #ifdef CONFIG_SCHED_SMT
07820 sd = &per_cpu(cpu_domains, i).sd;
07821 #elif defined(CONFIG_SCHED_MC)
07822 sd = &per_cpu(core_domains, i).sd;
07823 #else
07824 sd = &per_cpu(phys_domains, i).sd;
07825 #endif
07826 cpu_attach_domain(sd, rd, i);
07827 }
07828
07829 err = 0;
07830
07831 free_tmpmask:
07832 free_cpumask_var(tmpmask);
07833 free_send_covered:
07834 free_cpumask_var(send_covered);
07835 free_this_core_map:
07836 free_cpumask_var(this_core_map);
07837 free_this_sibling_map:
07838 free_cpumask_var(this_sibling_map);
07839 free_nodemask:
07840 free_cpumask_var(nodemask);
07841 free_notcovered:
07842 #ifdef CONFIG_NUMA
07843 free_cpumask_var(notcovered);
07844 free_covered:
07845 free_cpumask_var(covered);
07846 free_domainspan:
07847 free_cpumask_var(domainspan);
07848 out:
07849 #endif
07850 return err;
07851
07852 free_sched_groups:
07853 #ifdef CONFIG_NUMA
07854 kfree(sched_group_nodes);
07855 #endif
07856 goto free_tmpmask;
07857
07858 #ifdef CONFIG_NUMA
07859 error:
07860 free_sched_groups(cpu_map, tmpmask);
07861 free_rootdomain(rd);
07862 goto free_tmpmask;
07863 #endif
07864 }
07865
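      /* Build sched domains for the given CPU map using default attributes. */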
07866 static int build_sched_domains(const struct cpumask *cpu_map)
07867 {
07868 return __build_sched_domains(cpu_map, NULL);
07869 }
07870
07871 static struct cpumask *doms_cur;	/* current sched-domain partition (array of cpumasks) */
07872 static int ndoms_cur;		/* number of cpumasks in 'doms_cur' */
07873 static struct sched_domain_attr *dattr_cur;	/* attributes of the domains in 'doms_cur' */
07874
07875 /*
07876  * Special case: if allocation of the doms_cur partition (an array of
07877  * cpumasks) fails, fall back to a single sched domain described by
07878  * this single fallback cpumask.
07879  */
07880
07881 static cpumask_var_t fallback_doms;
07882
07883 /*
07884  * arch_update_cpu_topology() lets virtualized architectures update
07885  * the CPU core maps. It should return nonzero if the topology
07886  * changed, zero otherwise; this weak default reports "no change".
07887  */
07888 int __attribute__((weak)) arch_update_cpu_topology(void)
07889 {
07890 return 0;
07891 }
07892
07893 /*
07894  * Set up the initial scheduler domains and groups. Callers must hold
07895  * the hotplug lock. For now this only excludes isolated CPUs, but it
07896  * could exclude other special cases in the future.
07897  */
07898 static int arch_init_sched_domains(const struct cpumask *cpu_map)
07899 {
07900 int err;
07901
07902 arch_update_cpu_topology();
07903 ndoms_cur = 1;
07904 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
07905 if (!doms_cur)
07906 doms_cur = fallback_doms;
07907 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
07908 dattr_cur = NULL;
07909 err = build_sched_domains(doms_cur);
07910 register_sched_domain_sysctl();
07911
07912 return err;
07913 }
07914
07915 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
07916 struct cpumask *tmpmask)
07917 {
07918 free_sched_groups(cpu_map, tmpmask);
07919 }
07920
07921 /*
07922  * Detach sched domains from the CPUs in cpu_map: each CPU is moved to
07923  * the NULL domain / default root domain and the old groups are freed.
07924  */
07925 static void detach_destroy_domains(const struct cpumask *cpu_map)
07926 {
07927
07928 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
07929 int i;
07930
07931 for_each_cpu(i, cpu_map)
07932 cpu_attach_domain(NULL, &def_root_domain, i);
07933 synchronize_sched();
07934 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
07935 }
07936
07937 /* Compare domain attributes; a NULL array means default attributes. */
07938 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
07939 struct sched_domain_attr *new, int idx_new)
07940 {
07941 struct sched_domain_attr tmp;
07942
07943
07944 if (!new && !cur)
07945 return 1;
07946
07947 tmp = SD_ATTR_INIT;
07948 return !memcmp(cur ? (cur + idx_cur) : &tmp,
07949 new ? (new + idx_new) : &tmp,
07950 sizeof(struct sched_domain_attr));
07951 }
07952
07953 /*
07954  * Partition sched domains as specified by the 'ndoms_new'
07955  * cpumasks in the array doms_new[]. This is compared against the
07956  * current sched-domain partitioning in doms_cur[]: every deleted
07957  * domain is destroyed and every new domain is built.
07958  *
07959  * 'doms_new' is an array of cpumasks of length 'ndoms_new'. The
07960  * masks must not overlap; one sched domain is set up per mask, and
07961  * CPUs not covered by any mask are not load balanced. If a cpumask
07962  * appears both in doms_cur[] and doms_new[] (with equal attributes),
07963  * that domain is left untouched.
07964  *
07965  * The passed-in 'doms_new' should be kmalloc'd; this routine takes
07966  * ownership and kfrees it when it is replaced later. If the caller's
07967  * allocation failed, it can pass doms_new == NULL && ndoms_new == 1
07968  * and the single 'fallback_doms' partition is used instead, which
07969  * also forces the domains to be rebuilt.
07970  *
07971  * If doms_new == NULL the fallback covers cpu_online_mask minus the
07972  * isolated CPUs. ndoms_new == 0 is a special case that destroys the
07973  * existing domains without creating a default one.
07974  *
07975  * Call with the hotplug lock (get_online_cpus()) held.
07976  */
07977
07978
07979
07980 void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
07981 struct sched_domain_attr *dattr_new)
07982 {
07983 int i, j, n;
07984 int new_topology;
07985
07986 mutex_lock(&sched_domains_mutex);
07987
07988 /* always unregister the sysctl table first; it is re-registered below */
07989 unregister_sched_domain_sysctl();
07990
07991
07992 new_topology = arch_update_cpu_topology();
07993
07994 n = doms_new ? ndoms_new : 0;
07995
07996 /* Destroy current domains that do not appear unchanged in the new set */
07997 for (i = 0; i < ndoms_cur; i++) {
07998 for (j = 0; j < n && !new_topology; j++) {
07999 if (cpumask_equal(&doms_cur[i], &doms_new[j])
08000 && dattrs_equal(dattr_cur, i, dattr_new, j))
08001 goto match1;
08002 }
08003
08004 detach_destroy_domains(doms_cur + i);
08005 match1:
08006 ;
08007 }
08008
08009 if (doms_new == NULL) {
08010 ndoms_cur = 0;
08011 doms_new = fallback_doms;
08012 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
08013 WARN_ON_ONCE(dattr_new);
08014 }
08015
08016
08017 for (i = 0; i < ndoms_new; i++) {
08018 for (j = 0; j < ndoms_cur && !new_topology; j++) {
08019 if (cpumask_equal(&doms_new[i], &doms_cur[j])
08020 && dattrs_equal(dattr_new, i, dattr_cur, j))
08021 goto match2;
08022 }
08023
08024 __build_sched_domains(doms_new + i,
08025 dattr_new ? dattr_new + i : NULL);
08026 match2:
08027 ;
08028 }
08029
08030
08031 if (doms_cur != fallback_doms)
08032 kfree(doms_cur);
08033 kfree(dattr_cur);
08034 doms_cur = doms_new;
08035 dattr_cur = dattr_new;
08036 ndoms_cur = ndoms_new;
08037
08038 register_sched_domain_sysctl();
08039
08040 mutex_unlock(&sched_domains_mutex);
08041 }
08042
08043 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
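      /*
       * Tear down all sched domains and rebuild them, e.g. after the
       * power-savings balance policy has been changed below.
       */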
08044 static void arch_reinit_sched_domains(void)
08045 {
08046 get_online_cpus();
08047
08048
08049 partition_sched_domains(0, NULL, NULL);
08050
08051 rebuild_sched_domains();
08052 put_online_cpus();
08053 }
08054
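      /*
       * sysfs store handler shared by the sched_mc/sched_smt power-savings
       * attributes defined below. Illustrative usage (the exact path
       * depends on where the architecture registers the sysdev class,
       * typically the cpu class under /sys/devices/system/cpu/):
       *
       *   echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
       */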
08055 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
08056 {
08057 unsigned int level = 0;
08058
08059 if (sscanf(buf, "%u", &level) != 1)
08060 return -EINVAL;
08061
08062 /*
08063  * 'level' is unsigned, so values below POWERSAVINGS_BALANCE_NONE (0)
08064  * cannot occur; only the upper bound needs checking.
08065  */
08066
08067
08068
08069 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
08070 return -EINVAL;
08071
08072 if (smt)
08073 sched_smt_power_savings = level;
08074 else
08075 sched_mc_power_savings = level;
08076
08077 arch_reinit_sched_domains();
08078
08079 return count;
08080 }
08081
08082 #ifdef CONFIG_SCHED_MC
08083 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
08084 char *page)
08085 {
08086 return sprintf(page, "%u\n", sched_mc_power_savings);
08087 }
08088 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
08089 const char *buf, size_t count)
08090 {
08091 return sched_power_savings_store(buf, count, 0);
08092 }
08093 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
08094 sched_mc_power_savings_show,
08095 sched_mc_power_savings_store);
08096 #endif
08097
08098 #ifdef CONFIG_SCHED_SMT
08099 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
08100 char *page)
08101 {
08102 return sprintf(page, "%u\n", sched_smt_power_savings);
08103 }
08104 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
08105 const char *buf, size_t count)
08106 {
08107 return sched_power_savings_store(buf, count, 1);
08108 }
08109 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
08110 sched_smt_power_savings_show,
08111 sched_smt_power_savings_store);
08112 #endif
08113
08114 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
08115 {
08116 int err = 0;
08117
08118 #ifdef CONFIG_SCHED_SMT
08119 if (smt_capable())
08120 err = sysfs_create_file(&cls->kset.kobj,
08121 &attr_sched_smt_power_savings.attr);
08122 #endif
08123 #ifdef CONFIG_SCHED_MC
08124 if (!err && mc_capable())
08125 err = sysfs_create_file(&cls->kset.kobj,
08126 &attr_sched_mc_power_savings.attr);
08127 #endif
08128 return err;
08129 }
08130 #endif
08131
08132 #ifndef CONFIG_CPUSETS
08133 /*
08134  * Rebuild the default sched domain on CPU hotplug. When cpusets are
08135  * enabled they take over this duty, so this notifier is not used.
08136  */
08137 static int update_sched_domains(struct notifier_block *nfb,
08138 unsigned long action, void *hcpu)
08139 {
08140 switch (action) {
08141 case CPU_ONLINE:
08142 case CPU_ONLINE_FROZEN:
08143 case CPU_DEAD:
08144 case CPU_DEAD_FROZEN:
08145 partition_sched_domains(1, NULL, NULL);
08146 return NOTIFY_OK;
08147
08148 default:
08149 return NOTIFY_DONE;
08150 }
08151 }
08152 #endif
08153
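      /*
       * CPU-hotplug notifier: disable RT runtime sharing on a runqueue
       * whose CPU is going down, re-enable it when the CPU comes back.
       */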
08154 static int update_runtime(struct notifier_block *nfb,
08155 unsigned long action, void *hcpu)
08156 {
08157 int cpu = (int)(long)hcpu;
08158
08159 switch (action) {
08160 case CPU_DOWN_PREPARE:
08161 case CPU_DOWN_PREPARE_FROZEN:
08162 disable_runtime(cpu_rq(cpu));
08163 return NOTIFY_OK;
08164
08165 case CPU_DOWN_FAILED:
08166 case CPU_DOWN_FAILED_FROZEN:
08167 case CPU_ONLINE:
08168 case CPU_ONLINE_FROZEN:
08169 enable_runtime(cpu_rq(cpu));
08170 return NOTIFY_OK;
08171
08172 default:
08173 return NOTIFY_DONE;
08174 }
08175 }
08176
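      /*
       * Late (SMP) scheduler init: build the initial sched domains,
       * register the hotplug notifiers and move init off isolated CPUs.
       */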
08177 void __init sched_init_smp(void)
08178 {
08179 cpumask_var_t non_isolated_cpus;
08180
08181 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
08182
08183 #if defined(CONFIG_NUMA)
08184 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
08185 GFP_KERNEL);
08186 BUG_ON(sched_group_nodes_bycpu == NULL);
08187 #endif
08188 get_online_cpus();
08189 mutex_lock(&sched_domains_mutex);
08190 arch_init_sched_domains(cpu_online_mask);
08191 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
08192 if (cpumask_empty(non_isolated_cpus))
08193 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
08194 mutex_unlock(&sched_domains_mutex);
08195 put_online_cpus();
08196
08197 #ifndef CONFIG_CPUSETS
08198
08199 hotcpu_notifier(update_sched_domains, 0);
08200 #endif
08201
08202
08203 hotcpu_notifier(update_runtime, 0);
08204
08205 init_hrtick();
08206
08207 /* Move the boot thread (current) to a non-isolated CPU */
08208 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
08209 BUG();
08210 sched_init_granularity();
08211 free_cpumask_var(non_isolated_cpus);
08212
08213 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
08214 init_sched_rt_class();
08215 }
08216 #else
08217 void __init sched_init_smp(void)
08218 {
08219 sched_init_granularity();
08220 }
08221 #endif
08222
08223 int in_sched_functions(unsigned long addr)
08224 {
08225 return in_lock_functions(addr) ||
08226 (addr >= (unsigned long)__sched_text_start
08227 && addr < (unsigned long)__sched_text_end);
08228 }
08229
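      /*
       * Initialize a CFS runqueue: empty rbtree and task list; note that
       * min_vruntime starts just below zero (u64 wrap) rather than at 0.
       */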
08230 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
08231 {
08232 cfs_rq->tasks_timeline = RB_ROOT;
08233 INIT_LIST_HEAD(&cfs_rq->tasks);
08234 #ifdef CONFIG_FAIR_GROUP_SCHED
08235 cfs_rq->rq = rq;
08236 #endif
08237 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
08238 }
08239
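      /*
       * Initialize an RT runqueue: empty priority array (with the
       * MAX_RT_PRIO bit as bitsearch delimiter) plus bandwidth state.
       */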
08240 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
08241 {
08242 struct rt_prio_array *array;
08243 int i;
08244
08245 array = &rt_rq->active;
08246 for (i = 0; i < MAX_RT_PRIO; i++) {
08247 INIT_LIST_HEAD(array->queue + i);
08248 __clear_bit(i, array->bitmap);
08249 }
08250
08251 __set_bit(MAX_RT_PRIO, array->bitmap);
08252
08253 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
08254 rt_rq->highest_prio = MAX_RT_PRIO;
08255 #endif
08256 #ifdef CONFIG_SMP
08257 rt_rq->rt_nr_migratory = 0;
08258 rt_rq->overloaded = 0;
08259 #endif
08260
08261 rt_rq->rt_time = 0;
08262 rt_rq->rt_throttled = 0;
08263 rt_rq->rt_runtime = 0;
08264 spin_lock_init(&rt_rq->rt_runtime_lock);
08265
08266 #ifdef CONFIG_RT_GROUP_SCHED
08267 rt_rq->rt_nr_boosted = 0;
08268 rt_rq->rq = rq;
08269 #endif
08270 }
08271
08272 #ifdef CONFIG_FAIR_GROUP_SCHED
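      /*
       * Wire one per-CPU cfs_rq (and optional sched_entity) of a task
       * group into that CPU's runqueue hierarchy.
       */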
08273 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
08274 struct sched_entity *se, int cpu, int add,
08275 struct sched_entity *parent)
08276 {
08277 struct rq *rq = cpu_rq(cpu);
08278 tg->cfs_rq[cpu] = cfs_rq;
08279 init_cfs_rq(cfs_rq, rq);
08280 cfs_rq->tg = tg;
08281 if (add)
08282 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
08283
08284 tg->se[cpu] = se;
08285
08286 if (!se)
08287 return;
08288
08289 if (!parent)
08290 se->cfs_rq = &rq->cfs;
08291 else
08292 se->cfs_rq = parent->my_q;
08293
08294 se->my_q = cfs_rq;
08295 se->load.weight = tg->shares;
08296 se->load.inv_weight = 0;
08297 se->parent = parent;
08298 }
08299 #endif
08300
08301 #ifdef CONFIG_RT_GROUP_SCHED
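      /*
       * Same as above, for the per-CPU rt_rq / sched_rt_entity of a
       * task group.
       */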
08302 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
08303 struct sched_rt_entity *rt_se, int cpu, int add,
08304 struct sched_rt_entity *parent)
08305 {
08306 struct rq *rq = cpu_rq(cpu);
08307
08308 tg->rt_rq[cpu] = rt_rq;
08309 init_rt_rq(rt_rq, rq);
08310 rt_rq->tg = tg;
08311 rt_rq->rt_se = rt_se;
08312 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
08313 if (add)
08314 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
08315
08316 tg->rt_se[cpu] = rt_se;
08317 if (!rt_se)
08318 return;
08319
08320 if (!parent)
08321 rt_se->rt_rq = &rq->rt;
08322 else
08323 rt_se->rt_rq = parent->my_q;
08324
08325 rt_se->my_q = rt_rq;
08326 rt_se->parent = parent;
08327 INIT_LIST_HEAD(&rt_se->run_list);
08328 }
08329 #endif
08330
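      /*
       * Early scheduler init: allocate the group arrays from bootmem,
       * set up every CPU's runqueue and turn the boot thread into the
       * idle task of the boot CPU.
       */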
08331 void __init sched_init(void)
08332 {
08333 int i, j;
08334 unsigned long alloc_size = 0, ptr;
08335
08336 #ifdef CONFIG_FAIR_GROUP_SCHED
08337 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
08338 #endif
08339 #ifdef CONFIG_RT_GROUP_SCHED
08340 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
08341 #endif
08342 #ifdef CONFIG_USER_SCHED
08343 alloc_size *= 2;
08344 #endif
08345
08346 /*
08347  * sched_init() runs before the page allocator is up; use bootmem.
08348  */
08349 if (alloc_size) {
08350 ptr = (unsigned long)alloc_bootmem(alloc_size);
08351
08352 #ifdef CONFIG_FAIR_GROUP_SCHED
08353 init_task_group.se = (struct sched_entity **)ptr;
08354 ptr += nr_cpu_ids * sizeof(void **);
08355
08356 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
08357 ptr += nr_cpu_ids * sizeof(void **);
08358
08359 #ifdef CONFIG_USER_SCHED
08360 root_task_group.se = (struct sched_entity **)ptr;
08361 ptr += nr_cpu_ids * sizeof(void **);
08362
08363 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
08364 ptr += nr_cpu_ids * sizeof(void **);
08365 #endif
08366 #endif
08367 #ifdef CONFIG_RT_GROUP_SCHED
08368 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
08369 ptr += nr_cpu_ids * sizeof(void **);
08370
08371 init_task_group.rt_rq = (struct rt_rq **)ptr;
08372 ptr += nr_cpu_ids * sizeof(void **);
08373
08374 #ifdef CONFIG_USER_SCHED
08375 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
08376 ptr += nr_cpu_ids * sizeof(void **);
08377
08378 root_task_group.rt_rq = (struct rt_rq **)ptr;
08379 ptr += nr_cpu_ids * sizeof(void **);
08380 #endif
08381 #endif
08382 }
08383
08384 #ifdef CONFIG_SMP
08385 init_defrootdomain();
08386 #endif
08387
08388 init_rt_bandwidth(&def_rt_bandwidth,
08389 global_rt_period(), global_rt_runtime());
08390
08391 #ifdef CONFIG_RT_GROUP_SCHED
08392 init_rt_bandwidth(&init_task_group.rt_bandwidth,
08393 global_rt_period(), global_rt_runtime());
08394 #ifdef CONFIG_USER_SCHED
08395 init_rt_bandwidth(&root_task_group.rt_bandwidth,
08396 global_rt_period(), RUNTIME_INF);
08397 #endif
08398 #endif
08399
08400 #ifdef CONFIG_GROUP_SCHED
08401 list_add(&init_task_group.list, &task_groups);
08402 INIT_LIST_HEAD(&init_task_group.children);
08403
08404 #ifdef CONFIG_USER_SCHED
08405 INIT_LIST_HEAD(&root_task_group.children);
08406 init_task_group.parent = &root_task_group;
08407 list_add(&init_task_group.siblings, &root_task_group.children);
08408 #endif
08409 #endif
08410
08411 for_each_possible_cpu(i) {
08412 struct rq *rq;
08413
08414 rq = cpu_rq(i);
08415 spin_lock_init(&rq->lock);
08416 rq->nr_running = 0;
08417 init_cfs_rq(&rq->cfs, rq);
08418 init_rt_rq(&rq->rt, rq);
08419 #ifdef CONFIG_FAIR_GROUP_SCHED
08420 init_task_group.shares = init_task_group_load;
08421 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
08422 #ifdef CONFIG_CGROUP_SCHED
08423 /*
08424  * How much CPU bandwidth does init_task_group get?
08425  *
08426  * For task groups created through the cgroup filesystem,
08427  * init_task_group gets 100% of the CPU resources in the system.
08428  * This overall system CPU resource is divided among the tasks of
08429  * init_task_group and its child task-groups in a fair manner, based
08430  * on each entity's (task's or task-group's) weight
08431  * (se->load.weight).
08432  *
08433  * For example, if init_task_group has 10 tasks of weight 1024 and
08434  * two child groups A0 and A1 (of weight 1024 each), then A0's share
08435  * of the CPU resource is:
08436  *
08437  *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
08438  *
08439  * This is achieved by letting init_task_group's tasks sit directly
08440  * in rq->cfs (i.e. init_task_group->se[] = NULL, as below).
08441  */
08442 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
08443 #elif defined CONFIG_USER_SCHED
08444 root_task_group.shares = NICE_0_LOAD;
08445 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
08446
08447
08448
08449
08450
08451
08452
08453
08454
08455
08456
08457 init_tg_cfs_entry(&init_task_group,
08458 &per_cpu(init_cfs_rq, i),
08459 &per_cpu(init_sched_entity, i), i, 1,
08460 root_task_group.se[i]);
08461
08462 #endif
08463 #endif
08464
08465 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
08466 #ifdef CONFIG_RT_GROUP_SCHED
08467 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
08468 #ifdef CONFIG_CGROUP_SCHED
08469 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
08470 #elif defined CONFIG_USER_SCHED
08471 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
08472 init_tg_rt_entry(&init_task_group,
08473 &per_cpu(init_rt_rq, i),
08474 &per_cpu(init_sched_rt_entity, i), i, 1,
08475 root_task_group.rt_se[i]);
08476 #endif
08477 #endif
08478
08479 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
08480 rq->cpu_load[j] = 0;
08481 #ifdef CONFIG_SMP
08482 rq->sd = NULL;
08483 rq->rd = NULL;
08484 rq->active_balance = 0;
08485 rq->next_balance = jiffies;
08486 rq->push_cpu = 0;
08487 rq->cpu = i;
08488 rq->online = 0;
08489 rq->migration_thread = NULL;
08490 INIT_LIST_HEAD(&rq->migration_queue);
08491 rq_attach_root(rq, &def_root_domain);
08492 #endif
08493 init_rq_hrtick(rq);
08494 atomic_set(&rq->nr_iowait, 0);
08495 }
08496
08497 set_load_weight(&init_task);
08498
08499 #ifdef CONFIG_PREEMPT_NOTIFIERS
08500 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
08501 #endif
08502
08503 #ifdef CONFIG_SMP
08504 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
08505 #endif
08506
08507 #ifdef CONFIG_RT_MUTEXES
08508 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
08509 #endif
08510
08511 /*
08512  * The boot idle thread does lazy MMU switching as well:
08513  */
08514 atomic_inc(&init_mm.mm_count);
08515 enter_lazy_tlb(&init_mm, current);
08516
08517 /*
08518  * Make us the idle thread. Technically, schedule() should not be
08519  * called from this thread, however somewhere below it might be,
08520  * but because we are the idle thread, we just pick up running again
08521  * when this runqueue becomes "idle".
08522  */
08523 init_idle(current, smp_processor_id());
08524
08525
08526
08527 current->sched_class = &fair_sched_class;
08528
08529
08530 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
08531 #ifdef CONFIG_SMP
08532 #ifdef CONFIG_NO_HZ
08533 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
08534 #endif
08535 alloc_bootmem_cpumask_var(&cpu_isolated_map);
08536 #endif
08537
08538 scheduler_running = 1;
08539 }
08540
08541 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
08542 void __might_sleep(char *file, int line)
08543 {
08544 #ifdef in_atomic
08545 static unsigned long prev_jiffy;
08546
08547 if ((!in_atomic() && !irqs_disabled()) ||
08548 system_state != SYSTEM_RUNNING || oops_in_progress)
08549 return;
08550 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
08551 return;
08552 prev_jiffy = jiffies;
08553
08554 printk(KERN_ERR
08555 "BUG: sleeping function called from invalid context at %s:%d\n",
08556 file, line);
08557 printk(KERN_ERR
08558 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
08559 in_atomic(), irqs_disabled(),
08560 current->pid, current->comm);
08561
08562 debug_show_held_locks(current);
08563 if (irqs_disabled())
08564 print_irqtrace_events(current);
08565 dump_stack();
08566 #endif
08567 }
08568 EXPORT_SYMBOL(__might_sleep);
08569 #endif
08570
08571 #ifdef CONFIG_MAGIC_SYSRQ
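      /*
       * Helpers for the magic SysRq 'n' key: push all RT tasks back to
       * SCHED_NORMAL (see normalize_rt_tasks() below).
       */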
08572 static void normalize_task(struct rq *rq, struct task_struct *p)
08573 {
08574 int on_rq;
08575
08576 update_rq_clock(rq);
08577 on_rq = p->se.on_rq;
08578 if (on_rq)
08579 deactivate_task(rq, p, 0);
08580 __setscheduler(rq, p, SCHED_NORMAL, 0);
08581 if (on_rq) {
08582 activate_task(rq, p, 0);
08583 resched_task(rq->curr);
08584 }
08585 }
08586
08587 void normalize_rt_tasks(void)
08588 {
08589 struct task_struct *g, *p;
08590 unsigned long flags;
08591 struct rq *rq;
08592
08593 read_lock_irqsave(&tasklist_lock, flags);
08594 do_each_thread(g, p) {
08595
08596
08597
08598 if (!p->mm)
08599 continue;
08600
08601 p->se.exec_start = 0;
08602 #ifdef CONFIG_SCHEDSTATS
08603 p->se.wait_start = 0;
08604 p->se.sleep_start = 0;
08605 p->se.block_start = 0;
08606 #endif
08607
08608 if (!rt_task(p)) {
08609
08610
08611
08612
08613 if (TASK_NICE(p) < 0 && p->mm)
08614 set_user_nice(p, 0);
08615 continue;
08616 }
08617
08618 spin_lock(&p->pi_lock);
08619 rq = __task_rq_lock(p);
08620
08621 normalize_task(rq, p);
08622
08623 __task_rq_unlock(rq);
08624 spin_unlock(&p->pi_lock);
08625 } while_each_thread(g, p);
08626
08627 read_unlock_irqrestore(&tasklist_lock, flags);
08628 }
08629
08630 #endif
08631
08632 #ifdef CONFIG_IA64
08633 /*
08634  * These functions are only useful for the IA64 MCA handling.
08635  *
08636  * They can only be called when the whole system has been
08637  * stopped - every CPU needs to be quiescent, and no scheduling
08638  * activity can take place. Using them for anything else would
08639  * be a serious bug, and as a result, they aren't even visible
08640  * under any other configuration.
08641  */
08642
08643 /**
08644  * curr_task - return the current task for a given CPU.
08645  * @cpu: the processor in question.
08646  *
08647  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
08648  */
08649 struct task_struct *curr_task(int cpu)
08650 {
08651 return cpu_curr(cpu);
08652 }
08653
08654 /**
08655  * set_curr_task - set the current task for a given CPU.
08656  * @cpu: the processor in question.
08657  * @p: the task pointer to set.
08658  *
08659  * Description: This function must only be used when non-maskable
08660  * interrupts are serviced on a separate stack. It allows the
08661  * architecture to switch the notion of the current task on a CPU in
08662  * a non-blocking manner. It must be called with all CPUs synchronized
08663  * and interrupts disabled; the caller must save the original value of
08664  * the current task (see curr_task() above) and restore it before
08665  * re-enabling interrupts and restarting the system.
08666  *
08667  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
08668  */
08669 void set_curr_task(int cpu, struct task_struct *p)
08670 {
08671 cpu_curr(cpu) = p;
08672 }
08673
08674 #endif
08675
08676 #ifdef CONFIG_FAIR_GROUP_SCHED
08677 static void free_fair_sched_group(struct task_group *tg)
08678 {
08679 int i;
08680
08681 for_each_possible_cpu(i) {
08682 if (tg->cfs_rq)
08683 kfree(tg->cfs_rq[i]);
08684 if (tg->se)
08685 kfree(tg->se[i]);
08686 }
08687
08688 kfree(tg->cfs_rq);
08689 kfree(tg->se);
08690 }
08691
08692 static
08693 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
08694 {
08695 struct cfs_rq *cfs_rq;
08696 struct sched_entity *se;
08697 struct rq *rq;
08698 int i;
08699
08700 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
08701 if (!tg->cfs_rq)
08702 goto err;
08703 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
08704 if (!tg->se)
08705 goto err;
08706
08707 tg->shares = NICE_0_LOAD;
08708
08709 for_each_possible_cpu(i) {
08710 rq = cpu_rq(i);
08711
08712 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
08713 GFP_KERNEL, cpu_to_node(i));
08714 if (!cfs_rq)
08715 goto err;
08716
08717 se = kzalloc_node(sizeof(struct sched_entity),
08718 GFP_KERNEL, cpu_to_node(i));
08719 if (!se)
08720 goto err;
08721
08722 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
08723 }
08724
08725 return 1;
08726
08727 err:
08728 return 0;
08729 }
08730
08731 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
08732 {
08733 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
08734 &cpu_rq(cpu)->leaf_cfs_rq_list);
08735 }
08736
08737 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
08738 {
08739 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
08740 }
08741 #else
08742 static inline void free_fair_sched_group(struct task_group *tg)
08743 {
08744 }
08745
08746 static inline
08747 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
08748 {
08749 return 1;
08750 }
08751
08752 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
08753 {
08754 }
08755
08756 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
08757 {
08758 }
08759 #endif
08760
08761 #ifdef CONFIG_RT_GROUP_SCHED
08762 static void free_rt_sched_group(struct task_group *tg)
08763 {
08764 int i;
08765
08766 destroy_rt_bandwidth(&tg->rt_bandwidth);
08767
08768 for_each_possible_cpu(i) {
08769 if (tg->rt_rq)
08770 kfree(tg->rt_rq[i]);
08771 if (tg->rt_se)
08772 kfree(tg->rt_se[i]);
08773 }
08774
08775 kfree(tg->rt_rq);
08776 kfree(tg->rt_se);
08777 }
08778
08779 static
08780 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
08781 {
08782 struct rt_rq *rt_rq;
08783 struct sched_rt_entity *rt_se;
08784 struct rq *rq;
08785 int i;
08786
08787 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
08788 if (!tg->rt_rq)
08789 goto err;
08790 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
08791 if (!tg->rt_se)
08792 goto err;
08793
08794 init_rt_bandwidth(&tg->rt_bandwidth,
08795 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
08796
08797 for_each_possible_cpu(i) {
08798 rq = cpu_rq(i);
08799
08800 rt_rq = kzalloc_node(sizeof(struct rt_rq),
08801 GFP_KERNEL, cpu_to_node(i));
08802 if (!rt_rq)
08803 goto err;
08804
08805 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
08806 GFP_KERNEL, cpu_to_node(i));
08807 if (!rt_se)
08808 goto err;
08809
08810 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
08811 }
08812
08813 return 1;
08814
08815 err:
08816 return 0;
08817 }
08818
08819 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
08820 {
08821 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
08822 &cpu_rq(cpu)->leaf_rt_rq_list);
08823 }
08824
08825 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
08826 {
08827 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
08828 }
08829 #else
08830 static inline void free_rt_sched_group(struct task_group *tg)
08831 {
08832 }
08833
08834 static inline
08835 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
08836 {
08837 return 1;
08838 }
08839
08840 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
08841 {
08842 }
08843
08844 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
08845 {
08846 }
08847 #endif
08848
08849 #ifdef CONFIG_GROUP_SCHED
08850 static void free_sched_group(struct task_group *tg)
08851 {
08852 free_fair_sched_group(tg);
08853 free_rt_sched_group(tg);
08854 kfree(tg);
08855 }
08856
08857 /* allocate runqueue structures etc. for a new task group */
08858 struct task_group *sched_create_group(struct task_group *parent)
08859 {
08860 struct task_group *tg;
08861 unsigned long flags;
08862 int i;
08863
08864 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
08865 if (!tg)
08866 return ERR_PTR(-ENOMEM);
08867
08868 if (!alloc_fair_sched_group(tg, parent))
08869 goto err;
08870
08871 if (!alloc_rt_sched_group(tg, parent))
08872 goto err;
08873
08874 spin_lock_irqsave(&task_group_lock, flags);
08875 for_each_possible_cpu(i) {
08876 register_fair_sched_group(tg, i);
08877 register_rt_sched_group(tg, i);
08878 }
08879 list_add_rcu(&tg->list, &task_groups);
08880
08881 WARN_ON(!parent);
08882
08883 tg->parent = parent;
08884 INIT_LIST_HEAD(&tg->children);
08885 list_add_rcu(&tg->siblings, &parent->children);
08886 spin_unlock_irqrestore(&task_group_lock, flags);
08887
08888 return tg;
08889
08890 err:
08891 free_sched_group(tg);
08892 return ERR_PTR(-ENOMEM);
08893 }
08894
08895
08896 static void free_sched_group_rcu(struct rcu_head *rhp)
08897 {
08898
08899 free_sched_group(container_of(rhp, struct task_group, rcu));
08900 }
08901
08902
08903 void sched_destroy_group(struct task_group *tg)
08904 {
08905 unsigned long flags;
08906 int i;
08907
08908 spin_lock_irqsave(&task_group_lock, flags);
08909 for_each_possible_cpu(i) {
08910 unregister_fair_sched_group(tg, i);
08911 unregister_rt_sched_group(tg, i);
08912 }
08913 list_del_rcu(&tg->list);
08914 list_del_rcu(&tg->siblings);
08915 spin_unlock_irqrestore(&task_group_lock, flags);
08916
08917
08918 call_rcu(&tg->rcu, free_sched_group_rcu);
08919 }
08920
08921 /*
08922  * Move a task to its (new) task group: dequeue it, switch its
08923  * per-CPU runqueue pointers to the new group's cfs_rq/rt_rq via
08924  * set_task_rq(), and requeue it.
08925  */
08926 void sched_move_task(struct task_struct *tsk)
08927 {
08928 int on_rq, running;
08929 unsigned long flags;
08930 struct rq *rq;
08931
08932 rq = task_rq_lock(tsk, &flags);
08933
08934 update_rq_clock(rq);
08935
08936 running = task_current(rq, tsk);
08937 on_rq = tsk->se.on_rq;
08938
08939 if (on_rq)
08940 dequeue_task(rq, tsk, 0);
08941 if (unlikely(running))
08942 tsk->sched_class->put_prev_task(rq, tsk);
08943
08944 set_task_rq(tsk, task_cpu(tsk));
08945
08946 #ifdef CONFIG_FAIR_GROUP_SCHED
08947 if (tsk->sched_class->moved_group)
08948 tsk->sched_class->moved_group(tsk);
08949 #endif
08950
08951 if (unlikely(running))
08952 tsk->sched_class->set_curr_task(rq);
08953 if (on_rq)
08954 enqueue_task(rq, tsk, 0);
08955
08956 task_rq_unlock(rq, &flags);
08957 }
08958 #endif
08959
08960 #ifdef CONFIG_FAIR_GROUP_SCHED
08961 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
08962 {
08963 struct cfs_rq *cfs_rq = se->cfs_rq;
08964 int on_rq;
08965
08966 on_rq = se->on_rq;
08967 if (on_rq)
08968 dequeue_entity(cfs_rq, se, 0);
08969
08970 se->load.weight = shares;
08971 se->load.inv_weight = 0;
08972
08973 if (on_rq)
08974 enqueue_entity(cfs_rq, se, 0);
08975 }
08976
08977 static void set_se_shares(struct sched_entity *se, unsigned long shares)
08978 {
08979 struct cfs_rq *cfs_rq = se->cfs_rq;
08980 struct rq *rq = cfs_rq->rq;
08981 unsigned long flags;
08982
08983 spin_lock_irqsave(&rq->lock, flags);
08984 __set_se_shares(se, shares);
08985 spin_unlock_irqrestore(&rq->lock, flags);
08986 }
08987
08988 static DEFINE_MUTEX(shares_mutex);
08989
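      /*
       * Set the CFS weight ("cpu.shares") of a task group, clamped to
       * [MIN_SHARES, MAX_SHARES]; refuses to change the root group.
       */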
08990 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
08991 {
08992 int i;
08993 unsigned long flags;
08994
08995 /*
08996  * We can't change the weight of the root cgroup (it has no se[]).
08997  */
08998 if (!tg->se[0])
08999 return -EINVAL;
09000
09001 if (shares < MIN_SHARES)
09002 shares = MIN_SHARES;
09003 else if (shares > MAX_SHARES)
09004 shares = MAX_SHARES;
09005
09006 mutex_lock(&shares_mutex);
09007 if (tg->shares == shares)
09008 goto done;
09009
09010 spin_lock_irqsave(&task_group_lock, flags);
09011 for_each_possible_cpu(i)
09012 unregister_fair_sched_group(tg, i);
09013 list_del_rcu(&tg->siblings);
09014 spin_unlock_irqrestore(&task_group_lock, flags);
09015
09016
09017 synchronize_sched();
09018
09019
09020
09021
09022
09023 tg->shares = shares;
09024 for_each_possible_cpu(i) {
09025
09026
09027
09028 cfs_rq_set_shares(tg->cfs_rq[i], 0);
09029 set_se_shares(tg->se[i], shares);
09030 }
09031
09032
09033
09034
09035
09036 spin_lock_irqsave(&task_group_lock, flags);
09037 for_each_possible_cpu(i)
09038 register_fair_sched_group(tg, i);
09039 list_add_rcu(&tg->siblings, &tg->parent->children);
09040 spin_unlock_irqrestore(&task_group_lock, flags);
09041 done:
09042 mutex_unlock(&shares_mutex);
09043 return 0;
09044 }
09045
09046 unsigned long sched_group_shares(struct task_group *tg)
09047 {
09048 return tg->shares;
09049 }
09050 #endif
09051
09052 #ifdef CONFIG_RT_GROUP_SCHED
09053
09054
09055
09056 static DEFINE_MUTEX(rt_constraints_mutex);
09057
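      /*
       * Convert a bandwidth (runtime/period) into a fixed-point ratio
       * with a 20-bit fraction; RUNTIME_INF maps to 100% (1 << 20).
       */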
09058 static unsigned long to_ratio(u64 period, u64 runtime)
09059 {
09060 if (runtime == RUNTIME_INF)
09061 return 1ULL << 20;
09062
09063 return div64_u64(runtime << 20, period);
09064 }
09065
09066 /* Must be called with tasklist_lock (or equivalent) held. */
09067 static inline int tg_has_rt_tasks(struct task_group *tg)
09068 {
09069 struct task_struct *g, *p;
09070
09071 do_each_thread(g, p) {
09072 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
09073 return 1;
09074 } while_each_thread(g, p);
09075
09076 return 0;
09077 }
09078
09079 struct rt_schedulable_data {
09080 struct task_group *tg;
09081 u64 rt_period;
09082 u64 rt_runtime;
09083 };
09084
09085 static int tg_schedulable(struct task_group *tg, void *data)
09086 {
09087 struct rt_schedulable_data *d = data;
09088 struct task_group *child;
09089 unsigned long total, sum = 0;
09090 u64 period, runtime;
09091
09092 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
09093 runtime = tg->rt_bandwidth.rt_runtime;
09094
09095 if (tg == d->tg) {
09096 period = d->rt_period;
09097 runtime = d->rt_runtime;
09098 }
09099
09100 #ifdef CONFIG_USER_SCHED
09101 if (tg == &root_task_group) {
09102 period = global_rt_period();
09103 runtime = global_rt_runtime();
09104 }
09105 #endif
09106
09107 /*
09108  * Cannot have more runtime than the period.
09109  */
09110 if (runtime > period && runtime != RUNTIME_INF)
09111 return -EINVAL;
09112
09113 /*
09114  * Refuse to turn off bandwidth for a group that still has RT tasks.
09115  */
09116 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
09117 return -EBUSY;
09118
09119 total = to_ratio(period, runtime);
09120
09121 /*
09122  * Nobody can use more than the global RT bandwidth setting allows.
09123  */
09124 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
09125 return -EINVAL;
09126
09127 /*
09128  * The sum of our children's runtime may not exceed our own.
09129  */
09130 list_for_each_entry_rcu(child, &tg->children, siblings) {
09131 period = ktime_to_ns(child->rt_bandwidth.rt_period);
09132 runtime = child->rt_bandwidth.rt_runtime;
09133
09134 if (child == d->tg) {
09135 period = d->rt_period;
09136 runtime = d->rt_runtime;
09137 }
09138
09139 sum += to_ratio(period, runtime);
09140 }
09141
09142 if (sum > total)
09143 return -EINVAL;
09144
09145 return 0;
09146 }
09147
09148 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
09149 {
09150 struct rt_schedulable_data data = {
09151 .tg = tg,
09152 .rt_period = period,
09153 .rt_runtime = runtime,
09154 };
09155
09156 return walk_tg_tree(tg_schedulable, tg_nop, &data);
09157 }
09158
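      /*
       * Apply a new period/runtime pair to a group after checking that
       * the hierarchy stays schedulable, and propagate the new runtime
       * to every per-CPU rt_rq of the group.
       */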
09159 static int tg_set_bandwidth(struct task_group *tg,
09160 u64 rt_period, u64 rt_runtime)
09161 {
09162 int i, err = 0;
09163
09164 mutex_lock(&rt_constraints_mutex);
09165 read_lock(&tasklist_lock);
09166 err = __rt_schedulable(tg, rt_period, rt_runtime);
09167 if (err)
09168 goto unlock;
09169
09170 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
09171 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
09172 tg->rt_bandwidth.rt_runtime = rt_runtime;
09173
09174 for_each_possible_cpu(i) {
09175 struct rt_rq *rt_rq = tg->rt_rq[i];
09176
09177 spin_lock(&rt_rq->rt_runtime_lock);
09178 rt_rq->rt_runtime = rt_runtime;
09179 spin_unlock(&rt_rq->rt_runtime_lock);
09180 }
09181 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
09182 unlock:
09183 read_unlock(&tasklist_lock);
09184 mutex_unlock(&rt_constraints_mutex);
09185
09186 return err;
09187 }
09188
09189 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
09190 {
09191 u64 rt_runtime, rt_period;
09192
09193 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
09194 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
09195 if (rt_runtime_us < 0)
09196 rt_runtime = RUNTIME_INF;
09197
09198 return tg_set_bandwidth(tg, rt_period, rt_runtime);
09199 }
09200
09201 long sched_group_rt_runtime(struct task_group *tg)
09202 {
09203 u64 rt_runtime_us;
09204
09205 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
09206 return -1;
09207
09208 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
09209 do_div(rt_runtime_us, NSEC_PER_USEC);
09210 return rt_runtime_us;
09211 }
09212
09213 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
09214 {
09215 u64 rt_runtime, rt_period;
09216
09217 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
09218 rt_runtime = tg->rt_bandwidth.rt_runtime;
09219
09220 if (rt_period == 0)
09221 return -EINVAL;
09222
09223 return tg_set_bandwidth(tg, rt_period, rt_runtime);
09224 }
09225
09226 long sched_group_rt_period(struct task_group *tg)
09227 {
09228 u64 rt_period_us;
09229
09230 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
09231 do_div(rt_period_us, NSEC_PER_USEC);
09232 return rt_period_us;
09233 }
09234
09235 static int sched_rt_global_constraints(void)
09236 {
09237 u64 runtime, period;
09238 int ret = 0;
09239
09240 if (sysctl_sched_rt_period <= 0)
09241 return -EINVAL;
09242
09243 runtime = global_rt_runtime();
09244 period = global_rt_period();
09245
09246
09247
09248
09249 if (runtime > period && runtime != RUNTIME_INF)
09250 return -EINVAL;
09251
09252 mutex_lock(&rt_constraints_mutex);
09253 read_lock(&tasklist_lock);
09254 ret = __rt_schedulable(NULL, 0, 0);
09255 read_unlock(&tasklist_lock);
09256 mutex_unlock(&rt_constraints_mutex);
09257
09258 return ret;
09259 }
09260
09261 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
09262 {
09263 /* Don't accept realtime tasks when there is no way for them to run. */
09264 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
09265 return 0;
09266
09267 return 1;
09268 }
09269
09270 #else
09271 static int sched_rt_global_constraints(void)
09272 {
09273 unsigned long flags;
09274 int i;
09275
09276 if (sysctl_sched_rt_period <= 0)
09277 return -EINVAL;
09278
09279 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
09280 for_each_possible_cpu(i) {
09281 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
09282
09283 spin_lock(&rt_rq->rt_runtime_lock);
09284 rt_rq->rt_runtime = global_rt_runtime();
09285 spin_unlock(&rt_rq->rt_runtime_lock);
09286 }
09287 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
09288
09289 return 0;
09290 }
09291 #endif
09292
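      /*
       * sysctl handler for sched_rt_period_us / sched_rt_runtime_us;
       * reverts the values if the new settings are not schedulable.
       */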
09293 int sched_rt_handler(struct ctl_table *table, int write,
09294 struct file *filp, void __user *buffer, size_t *lenp,
09295 loff_t *ppos)
09296 {
09297 int ret;
09298 int old_period, old_runtime;
09299 static DEFINE_MUTEX(mutex);
09300
09301 mutex_lock(&mutex);
09302 old_period = sysctl_sched_rt_period;
09303 old_runtime = sysctl_sched_rt_runtime;
09304
09305 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
09306
09307 if (!ret && write) {
09308 ret = sched_rt_global_constraints();
09309 if (ret) {
09310 sysctl_sched_rt_period = old_period;
09311 sysctl_sched_rt_runtime = old_runtime;
09312 } else {
09313 def_rt_bandwidth.rt_runtime = global_rt_runtime();
09314 def_rt_bandwidth.rt_period =
09315 ns_to_ktime(global_rt_period());
09316 }
09317 }
09318 mutex_unlock(&mutex);
09319
09320 return ret;
09321 }
09322
09323 #ifdef CONFIG_CGROUP_SCHED
09324
09325
09326 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
09327 {
09328 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
09329 struct task_group, css);
09330 }
09331
09332 static struct cgroup_subsys_state *
09333 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
09334 {
09335 struct task_group *tg, *parent;
09336
09337 if (!cgrp->parent) {
09338 /* The top cgroup maps onto init_task_group, set up in sched_init(). */
09339 return &init_task_group.css;
09340 }
09341
09342 parent = cgroup_tg(cgrp->parent);
09343 tg = sched_create_group(parent);
09344 if (IS_ERR(tg))
09345 return ERR_PTR(-ENOMEM);
09346
09347 return &tg->css;
09348 }
09349
09350 static void
09351 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
09352 {
09353 struct task_group *tg = cgroup_tg(cgrp);
09354
09355 sched_destroy_group(tg);
09356 }
09357
09358 static int
09359 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
09360 struct task_struct *tsk)
09361 {
09362 #ifdef CONFIG_RT_GROUP_SCHED
09363 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
09364 return -EINVAL;
09365 #else
09366 /* Without RT group scheduling, only fair-class tasks may be attached. */
09367 if (tsk->sched_class != &fair_sched_class)
09368 return -EINVAL;
09369 #endif
09370
09371 return 0;
09372 }
09373
09374 static void
09375 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
09376 struct cgroup *old_cont, struct task_struct *tsk)
09377 {
09378 sched_move_task(tsk);
09379 }
09380
09381 #ifdef CONFIG_FAIR_GROUP_SCHED
09382 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
09383 u64 shareval)
09384 {
09385 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
09386 }
09387
09388 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
09389 {
09390 struct task_group *tg = cgroup_tg(cgrp);
09391
09392 return (u64) tg->shares;
09393 }
09394 #endif
09395
09396 #ifdef CONFIG_RT_GROUP_SCHED
09397 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
09398 s64 val)
09399 {
09400 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
09401 }
09402
09403 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
09404 {
09405 return sched_group_rt_runtime(cgroup_tg(cgrp));
09406 }
09407
09408 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
09409 u64 rt_period_us)
09410 {
09411 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
09412 }
09413
09414 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
09415 {
09416 return sched_group_rt_period(cgroup_tg(cgrp));
09417 }
09418 #endif
09419
09420 static struct cftype cpu_files[] = {
09421 #ifdef CONFIG_FAIR_GROUP_SCHED
09422 {
09423 .name = "shares",
09424 .read_u64 = cpu_shares_read_u64,
09425 .write_u64 = cpu_shares_write_u64,
09426 },
09427 #endif
09428 #ifdef CONFIG_RT_GROUP_SCHED
09429 {
09430 .name = "rt_runtime_us",
09431 .read_s64 = cpu_rt_runtime_read,
09432 .write_s64 = cpu_rt_runtime_write,
09433 },
09434 {
09435 .name = "rt_period_us",
09436 .read_u64 = cpu_rt_period_read_uint,
09437 .write_u64 = cpu_rt_period_write_uint,
09438 },
09439 #endif
09440 };
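      /*
       * These entries appear as cpu.shares, cpu.rt_runtime_us and
       * cpu.rt_period_us in each cgroup directory. Illustrative usage
       * (mount point and group name are examples only):
       *
       *   mount -t cgroup -o cpu none /cgroup
       *   mkdir /cgroup/mygroup
       *   echo 512 > /cgroup/mygroup/cpu.shares
       *   echo 500000 > /cgroup/mygroup/cpu.rt_runtime_us
       */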
09441
09442 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
09443 {
09444 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
09445 }
09446
09447 struct cgroup_subsys cpu_cgroup_subsys = {
09448 .name = "cpu",
09449 .create = cpu_cgroup_create,
09450 .destroy = cpu_cgroup_destroy,
09451 .can_attach = cpu_cgroup_can_attach,
09452 .attach = cpu_cgroup_attach,
09453 .populate = cpu_cgroup_populate,
09454 .subsys_id = cpu_cgroup_subsys_id,
09455 .early_init = 1,
09456 };
09457
09458 #endif
09459
09460 #ifdef CONFIG_CGROUP_CPUACCT
09461
09462 /*
09463  * CPU accounting code for task groups.
09464  *
09465  * Accumulates per-CPU usage in cpuacct_charge() and exposes it via
09466  * the cpuacct.usage and cpuacct.usage_percpu cgroup files.
09467  */
09468
09469 /* track CPU usage of a group of tasks and its child groups */
09470 struct cpuacct {
09471 struct cgroup_subsys_state css;
09472
09473 u64 *cpuusage;
09474 struct cpuacct *parent;
09475 };
09476
09477 struct cgroup_subsys cpuacct_subsys;
09478
09479
09480 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
09481 {
09482 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
09483 struct cpuacct, css);
09484 }
09485
09486
09487 static inline struct cpuacct *task_ca(struct task_struct *tsk)
09488 {
09489 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
09490 struct cpuacct, css);
09491 }
09492
09493
09494 static struct cgroup_subsys_state *cpuacct_create(
09495 struct cgroup_subsys *ss, struct cgroup *cgrp)
09496 {
09497 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
09498
09499 if (!ca)
09500 return ERR_PTR(-ENOMEM);
09501
09502 ca->cpuusage = alloc_percpu(u64);
09503 if (!ca->cpuusage) {
09504 kfree(ca);
09505 return ERR_PTR(-ENOMEM);
09506 }
09507
09508 if (cgrp->parent)
09509 ca->parent = cgroup_ca(cgrp->parent);
09510
09511 return &ca->css;
09512 }
09513
09514
09515 static void
09516 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
09517 {
09518 struct cpuacct *ca = cgroup_ca(cgrp);
09519
09520 free_percpu(ca->cpuusage);
09521 kfree(ca);
09522 }
09523
09524 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
09525 {
09526 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
09527 u64 data;
09528
09529 #ifndef CONFIG_64BIT
09530 /*
09531  * Take rq->lock to make the 64-bit read coherent on 32-bit platforms.
09532  */
09533 spin_lock_irq(&cpu_rq(cpu)->lock);
09534 data = *cpuusage;
09535 spin_unlock_irq(&cpu_rq(cpu)->lock);
09536 #else
09537 data = *cpuusage;
09538 #endif
09539
09540 return data;
09541 }
09542
09543 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
09544 {
09545 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
09546
09547 #ifndef CONFIG_64BIT
09548
09549
09550
09551 spin_lock_irq(&cpu_rq(cpu)->lock);
09552 *cpuusage = val;
09553 spin_unlock_irq(&cpu_rq(cpu)->lock);
09554 #else
09555 *cpuusage = val;
09556 #endif
09557 }
09558
09559
09560 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
09561 {
09562 struct cpuacct *ca = cgroup_ca(cgrp);
09563 u64 totalcpuusage = 0;
09564 int i;
09565
09566 for_each_present_cpu(i)
09567 totalcpuusage += cpuacct_cpuusage_read(ca, i);
09568
09569 return totalcpuusage;
09570 }
09571
09572 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
09573 u64 reset)
09574 {
09575 struct cpuacct *ca = cgroup_ca(cgrp);
09576 int err = 0;
09577 int i;
09578
09579 if (reset) {
09580 err = -EINVAL;
09581 goto out;
09582 }
09583
09584 for_each_present_cpu(i)
09585 cpuacct_cpuusage_write(ca, i, 0);
09586
09587 out:
09588 return err;
09589 }
09590
09591 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
09592 struct seq_file *m)
09593 {
09594 struct cpuacct *ca = cgroup_ca(cgroup);
09595 u64 percpu;
09596 int i;
09597
09598 for_each_present_cpu(i) {
09599 percpu = cpuacct_cpuusage_read(ca, i);
09600 seq_printf(m, "%llu ", (unsigned long long) percpu);
09601 }
09602 seq_printf(m, "\n");
09603 return 0;
09604 }
09605
09606 static struct cftype files[] = {
09607 {
09608 .name = "usage",
09609 .read_u64 = cpuusage_read,
09610 .write_u64 = cpuusage_write,
09611 },
09612 {
09613 .name = "usage_percpu",
09614 .read_seq_string = cpuacct_percpu_seq_read,
09615 },
09616
09617 };
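      /*
       * Exposed as cpuacct.usage (total, in nanoseconds) and
       * cpuacct.usage_percpu (one value per present CPU). Illustrative
       * usage (group path is an example only):
       *
       *   cat /cgroup/mygroup/cpuacct.usage
       *   echo 0 > /cgroup/mygroup/cpuacct.usage   (reset; only 0 is accepted)
       */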
09618
09619 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
09620 {
09621 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
09622 }
09623
09624 /*
09625  * Charge this task's execution time (in nanoseconds) to its
09626  * accounting group and all of that group's ancestors; called with
09627  * rq->lock held.
09628  */
09629 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
09630 {
09631 struct cpuacct *ca;
09632 int cpu;
09633
09634 if (!cpuacct_subsys.active)
09635 return;
09636
09637 cpu = task_cpu(tsk);
09638 ca = task_ca(tsk);
09639
09640 for (; ca; ca = ca->parent) {
09641 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
09642 *cpuusage += cputime;
09643 }
09644 }
09645
09646 struct cgroup_subsys cpuacct_subsys = {
09647 .name = "cpuacct",
09648 .create = cpuacct_create,
09649 .destroy = cpuacct_destroy,
09650 .populate = cpuacct_populate,
09651 .subsys_id = cpuacct_subsys_id,
09652 };
09653 #endif
09654 #endif