Load balance in Linux source level bottom-up analysis

리눅스에서 load balance는 load_balance()라는 함수와 각 스케쥴링 클래스에 있는 load_balance 함수를 통해서 이루어진다. 후자의 경우 virtual function table에 의해서 load_balance_fair, load_balance_idle, load_balance_rt로 matching이 된다.

먼저 sched.c에 있는 load_balance()함수는 다음과 같다.

/*
 * load_balance - check whether @sd is imbalanced and, if so, try to move
 * load from the busiest runqueue found in the domain over to @this_cpu.
 *
 * @this_cpu: CPU that load is moved to.
 * @this_rq:  local runqueue; passed to move_tasks() together with the
 *            busiest runqueue and locked with it while tasks are moved.
 * @sd:       sched domain being balanced; its failure counter, balance
 *            interval and schedstat counters are updated here.
 * @idle:     idle type of @this_cpu; also indexes the schedstat counters.
 * @balance:  flag handed to find_busiest_group(); when it reads 0
 *            afterwards the function takes the out_balanced path.
 *
 * Returns the move_tasks() result (non-zero when load was moved), 0 when
 * nothing was moved, or -1 when nothing was moved but sd_idle is clear on
 * an SD_SHARE_CPUPOWER domain whose parent does not have
 * SD_POWERSAVINGS_BALANCE set.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	/*
	 * Candidate CPUs start out as the active mask; a CPU whose tasks
	 * all turn out to be pinned is removed from it before a retry
	 * (see the all_pinned handling below).
	 */
	cpumask_copy(cpus, cpu_active_mask);

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	/* Re-entered with a reduced cpus mask after an all-pinned attempt. */
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	/*
	 * *balance reads 0 (presumably cleared by find_busiest_group()):
	 * handle the domain as balanced.
	 */
	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	/* The busiest runqueue must never be our own. */
	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			/* Drop that CPU from the candidates and look again. */
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	/*
	 * Nothing moved: count the failure and, once enough of them have
	 * accumulated, fall back to active balancing.
	 */
	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			/* Inspect and update busiest under its own lock. */
			spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread, if the curr
			 * task on busiest cpu can't be moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			/*
			 * Request active balancing towards this_cpu only if
			 * busiest does not already have a request flagged.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			/* Wake the migration thread after dropping the lock. */
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	/*
	 * Nothing was moved and sd_idle is clear on an SD_SHARE_CPUPOWER
	 * domain without a powersavings parent: report -1 instead of 0.
	 */
	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	/* No balancing was needed or possible: clear the failure count. */
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	/* Same -1/0 result selection as on the main path, minus ld_moved. */
	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	/* Any non-zero result (moved load, or -1) refreshes the shares. */
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}

이 함수의 경우 sched.c에 있는 rebalance_domains() 함수에서 호출된다. 이 함수는 run_rebalance_domains()에서 호출이 되며 이 함수는 sched_init에서 softirq(SCHED_SOFTIRQ)로 설정이 되어있다. 이 인터럽트는 trigger_load_balance()에서 호출이 된다. 이 함수는 scheduler_tick()에서 호출이 된다. 이 함수는 update_process_times(int user_tick)에서 호출이 된다. 그리고 이 함수는 인터럽트에서 불리는데 arch/arm의 경우에 timer_tick()에서 호출이 된다.

즉 정리하자면

1. 일정 시간에 한번씩 타이머 인터럽트에서 timer_tick()이 호출
2. timer_tick()에서 update_process_times() 호출
3. update_process_times()에서 scheduler_tick() 호출
4. scheduler_tick()에서 trigger_load_balance() 호출
5. trigger_load_balance()에서 SCHED_SOFTIRQ 인터럽트 호출
6. SCHED_SOFTIRQ 인터럽트에 의해서 run_rebalance_domains() 호출
7. run_rebalance_domains()에서 rebalance_domains() 호출
8. rebalance_domains()에서 load_balance() 호출

이와 별개로 각 class별 load_balance 함수인 load_balance_fair와 load_balance_idle, load_balance_rt는 task migration을 하는 move_tasks()에서 class->load_balance()를 통해 호출된다.

/*
 * move_tasks - ask each scheduling class in turn, starting from
 * sched_class_highest, to move load from @busiest to @this_rq.
 *
 * Every class is offered only the part of @max_load_move that the
 * previous classes have not already moved. The walk ends when the class
 * list is exhausted or the requested amount has been reached.
 *
 * Returns 1 if any load was moved, 0 otherwise.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *cls = sched_class_highest;
	unsigned long moved = 0;
	int best_prio = this_rq->curr->prio;

	for (;;) {
		unsigned long remaining = max_load_move - moved;

		moved += cls->load_balance(this_rq, this_cpu, busiest,
					   remaining, sd, idle, all_pinned,
					   &best_prio);
		cls = cls->next;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing adds latency, so a preemptible kernel
		 * stops as soon as this runqueue has something to run,
		 * keeping the critical section short.
		 */
		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;
#endif
		/* Out of classes, or the requested load has been moved. */
		if (!cls || moved >= max_load_move)
			break;
	}

	return moved > 0;
}

이 move_tasks() 함수는 load_balance()와 load_balance_newidle()에서 호출이 된다. 즉 load_balance의 일환으로 각 rq의 task를 다시 배치하는 과정에서 사용이 된다.

linux-2.6.32에서 load_balance와 관련 있는 함수는

move_task()
move_tasks()
iter_move_one_task()
move_one_task(),

get_sd_load_idx(), init_sd_power_savings_stats(), update_sd_power_savings_stats(), check_power_save_busiest_group(), default_scale_freq_power(), arch_scale_freq_power(), default_scale_smt_power(), arch_scale_smt_power(), scale_rt_power(), update_cpu_power(), update_group_power(),

find_busiest_group(), find_busiest_queue()
: 가장 바쁜 cpu group과 그 중에서 runqueue를 찾는 함수.

update_sg_lb_stats(), update_sd_lb_stats(), fix_small_imbalance(), calculate_imbalance()
: load_balancing을 위해서 sched_group과 sched_domain의 statistics를 계산하는 함수

load_balance(), load_balance_newidle(), idle_balance()
: 실제 lb이 일어나는 함수

구조체로는 sd_lb_stats, sg_lb_stats 이 있다.

load_balance() 함수의 동작

1. cpus에 cpu_active_mask를 복사한다. 현재 돌아가는 cpu mask를 뜻하고 나중에 affinity를 계산하기 위해서 사용된다.
2. 현재 CPU의 상태가 IDLE이고, SD_SHARE_CPUPOWER이고, POWERSAVING이 아니라면 sd_idle을 1로 set한다. 이 변수는 현재 cpu가 IDLE이 되었기 때문에 로드밸런싱을 한다는 것을 의미한다.

redo:
1. group에 find_busiest_group()을 이용해서 가장 바쁜 group을 얻는다.
2. 만약 *balance가 0이라면, 즉 this_cpu가 현시점에 load_balancing을 수행하기에 적절한 cpu가 아니라면 goto out_balanced
3. 만약 group을 찾지 못했다면 goto out_balanced
4. 위에 조건에 만족하지 않았으면, 즉 로드 밸런싱을 할 필요가 있고, group 역시 찾았다면 find_busiest_queue()를 이용해서 해당 group에서 busiest queue를 찾는다.
5. 만약 busiest가 없다면 goto out_balanced
6. migration 성공 여부를 나타내는 ld_moved를 0으로 초기화 한다.
7. 앞에서 찾은 busiest 런큐의 속한 task가 1개보다 많다면
7.1 인터럽트를 끄고, this_rq와 busiest를 모두 lock한다. 이는 busiest에서 this_rq로 migration을 위해서이다.
7.2 move_tasks()를 이용해서 migration을 시킨다. move_tasks()의 반환값(load가 조금이라도 옮겨졌으면 1, 아니면 0)은 ld_moved에 저장한다. 이 부분이 실제적 load_balance로 busiest 런큐에 속한 task를 this_cpu의 런큐에 migrate시킨다.
7.3 rq들을 unlock 하고, 인터럽트를 다시 켠다.
7.4 만약 migration을 했고, this_cpu와 현재 이 함수를 호출하고 있는 CPU가 다르다면 resched_cpu(this_cpu)를 호출한다.
7.5 만약 all_pinned 라면, 즉 busiest 런큐의 모든 task가 CPU affinity로 고정(pinned)되어 있어서 migration 시킬 수 없다면
7.5.1 busiest가 속한 cpu를 cpus에서 clear한다.
7.5.2 만약 cpus의 cpumask가 아직 남아 있다면 goto redo
7.5.3 만약 cpus의 cpumask가 비었다면 goto out_balanced
8. 만약 ld_moved가 0이라면, 즉 migration이 일어나지 않았다면
8.1 sd->nr_balance_failed++, 즉 migration이 실패한 횟수를 count한다.
8.2 nr_balance_failed가 sd->cache_nice_tries+2보다 크다면, 즉 migration이 실패한 횟수가 일정 값보다 커지면
8.2.1 busiest를 lock하고
8.2.2 busiest 런큐의 curr가 this_cpu로 migration할 수 없다면
8.2.2.1 busiest를 unlock한다.
8.2.2.2 all_pinned 를 1로 set한다. 즉 this_cpu로는 migration할 수 없다는 것을 의미하는 것 같다.
8.2.2.3 goto out_one_pinned
8.2.3 만약 busiest 런큐가 현재 active_balance를 하고 있지 않다면

8.2.3.1 busiest->active_balance 를 1로 한다. 즉 8.2.3 조건을 만족시키지 않게해서 이 scope에는 한번만 들어오게 한다.
8.2.3.2 busiest의 push_cpu = this_cpu로 한다. migration시 this_cpu로 task를 push하라는 의미로 생각된다.
8.2.3.3 active_balance = 1, 이 flag를 보고 뒤에서 migration_thread를 실행시킨다.
8.2.4 busiest를 unlock한다.
8.2.5 만약 active_balance가 1이라면 busiest의 migration_thread를 깨운다(wake_up_process). 이 migration_thread에서 실제 task의 push가 일어난다. 즉 busy한 rq에서 자신의 task를 push_cpu인 this_cpu(현재 이 함수를 호출하는 cpu)로 migrate시킨다.
8.2.6 sd->nr_balance_failed = sd->cache_nice_tries+1 로 한다. 8.1과 8.2를 고려하면 다음번에 이 scope에 들어오지 않도록 하는 부분이다.
9. ld_moved가 0이 아니라면, 즉 migration이 성공했다면 sd->nr_balance_failed = 0으로 한다.
10. 만약 로컬 변수 active_balance가 0이라면, 즉 이번 호출에서 active balancing을 새로 시작시키지 않았다면 (migration에 성공했거나, 실패 횟수가 아직 임계값을 넘지 않았거나, busiest 런큐가 이미 active_balance를 하고 있는 경우)
10.1 sd->balance_interval을 sd->min_interval로 한다.
11. 만약 로컬 변수 active_balance가 set 되어 있다면, 즉 ld_moved가 0이고 이번 호출에서 busiest 런큐의 active balancing을 새로 시작시켰다는 의미이다.
11.1 sd->balance_interval이 max_interval보다 작다면 2배로 키운다. 즉 다음번 load balance를 뒤로 더 미룬다.
12. 만약 migration이 일어나지 않았고, sd_idle이 아니고, SD_SHARE_CPUPOWER이고, POWERSAVING이 아니라면 ld_moved = -1
13. goto out

out_balanced:
1. sd->nr_balance_failed를 0으로 한다.

out_one_pinned:
1. 만약 ( all_pinned이고, interval이 MAX_PINNED_INTERVAL보다 작거나 ) || ( interval이 max_interval보다 작으면) sd->balance_interval을 두 배로 한다.
2. 만약 sd_idle이 아니고, SD_SHARE_CPUPOWER이고, POWERSAVINGS_BALANCE가 아니라면 ld_moved = -1
3. 만약 그렇지 않다면 ld_moved = 0

out:
1. 만약 ld_moved가 0이 아니라면(1 또는 -1) update_shares(sd)를 호출한다.
2. return ld_moved

load_balance_newidle() 함수의 동작

1. cpus에 cpu_active_mask를 복사한다. 현재 돌아가는 cpu mask를 뜻하고 나중에 affinity를 계산하기 위해서 사용된다.
2. 현재 CPU의 상태가 IDLE이고, SD_SHARE_CPUPOWER이고, powersaving을 위해 로드밸런싱을 하는 것이 아니라면 sd_idle이라는 로컬 변수를 1로 set한다. 즉 현재 cpu가 IDLE이 되었기 때문에 로드 밸런싱을 하겠다는 의미이다.

redo:
1. group에 find_busiest_group()을 이용해서 가장 바쁜 group을 얻는다.
2. 만약 balance가 0이라면, 즉 문제가 없다면 goto out_balanced
3. 만약 group을 찾지 못했다면 goto out_balanced
4. 위에 조건에 만족하지 않았으면, 즉 로드 밸런싱을 할 필요가 있고, group 역시 찾았다면 find_busiest_queue()를 이용해서 해당 group에서 busiest queue를 찾는다.
5. 만약 busiest가 없다면 goto out_balanced
6. migration 된 수를 의미하는 ld_moved를 0으로 초기화 한다.
7. 앞에서 찾은 busiest 런큐의 속한 task가 1개보다 많다면
7.1 인터럽트를 끄고, this_rq와 busiest를 모두 lock한다. 이는 busiest에서 this_rq로 migration을 위해서이다.
7.2 move_tasks()를 이용해서 migration을 시킨다. move_tasks()의 반환값(load가 조금이라도 옮겨졌으면 1, 아니면 0)은 ld_moved에 저장한다.
7.3 인터럽트를 다시 키고, rq들을 unlock 한다.
7.4 만약 migration을 했고, this_cpu와 현재 이 함수를 호출하고 있는 CPU가 다르다면 resched_cpu(this_cpu)를 호출한다.
7.5 만약 all_pinned 라면, 즉 현재 task를 migration 시킬 수 없다면?
7.5.1 busiest가 속한 cpu를 cpus에서 clear한다.
7.5.2 만약 cpus의 cpumask가 아직 남아 있다면 goto redo
8. 만약 ld_moved가 0이라면, 즉 migration이 일어나지 않았다면
8.1 active_balance 변수를 0으로 한다.
8.2 만약 현재 sd_idle이 0이고, SD_SHARE_CPUPOWER이고, SD가 POWERSAVINGS_BALANCE가 아니라면 return -1
8.3 만약 sched_mc_power_savings가 POWERSAVINGS_BALANCE_WAKEUP보다 작다면 return -1
8.4 만약 sd->nr_balance_failed++ 가 2보다 작다면 return -1
- 주석: idle상태가 아닌 cpu에 있는 task만이 this_cpu로 옮겨질 수 있다. imbalance로 인한 일반적인 task pull의 경우 move_task가 성공해서 nr_moved는 0보다 큰 값을 가질 것이다. 그렇다면 이 scope안에 들어오지 않을 것이고, active_balance() 역시 불리지 않을 것이다.
8.5 this_rq와 busiest를 lock한다.
8.6 busiest 런큐의 curr가 허용한 cpu에 this_cpu가 속해 있지 않다면
8.6.1 this_rq와 busiest를 unlock하고, all_pinned = 1로 한다.
8.6.2 return ld_moved, 이 때 ld_moved는 0이다. 결국 return 0이다.
8.7 만약 busiest 런큐가 현재 active_balance를 하고 있지 않다면
8.7.1 busiest->active_balance 를 1로 한다. 즉 8.7 조건을 만족시키지 않게해서 이 scope에는 한번만 들어오게 한다.
8.7.2 busiest의 push_cpu = this_cpu로 한다. migration시 this_cpu로 task를 push하라는 의미로 생각된다.
8.7.3 active_balance = 1, 이 flag를 보고 뒤에서 migration_thread를 실행시킨다.
8.8 this_rq와 busiest를 unlock하고, this_rq 역시 unlock한다.
8.9 앞에서 setting한 active_balance를 확인해서 busiest런큐에 있는 migration_thread를 킨다. 이 쓰레드에서 액티브 로드 밸런싱을 수행한다. 이 과정에서 this_cpu로 task를 push한다.
8.10 this_rq를 lock한다.
9. ld_moved가 0이 아니라면, 즉 migration된 task가 있다면 nr_balance_failed를 0으로 set한다.
10. update_shares_locked 호출
11. return ld_moved,

out_balanced:
1. 만약 sd_idle이 아니고, SD_SHARE_CPUPOWER이고, POWERSAVING이 아니라면 return -1
2. sd->nr_balance_failed = 0;
3. return 0

find_busiest_group() 함수의 동작

Argument
sd: busiest_group이 return될 domain
this_cpu: load_balancing이 현재 일어나는 cpu
imbalance: group이 idle이 되기 위해서 얼마나 많은 weighted load들이 move되야하는지를 나타내는 값
idle: this_cpu의 현재 idle type
sd_idle: sd의 idle type
cpus: load_balancing이 수행될 수 있는 cpu
balance: this_cpu가 현시점에 load_balancing을 하기에 적절한지 여부를 나타내는 포인터

1. update_sd_lb_stats() 함수의 호출, load_balance에 필요한 변수들을 sd_lb_stats 구조체의 sds에 저장한다. 이 부분에서 기본적인 계산을 다 한다. busiest_group를 찾는 작업도 실제적으로는 이곳에서 수행된다.

/*
 * sd_lb_stats - statistics of a sched_domain gathered for load balancing.
 * Filled in by update_sd_lb_stats() and then used by find_busiest_group().
 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;  /* Local group in this sd */
	unsigned long total_load;  /* Total load of all groups in sd */
	unsigned long total_pwr;   /* Total power of all groups in sd */
	unsigned long avg_load;	   /* Average load across all groups in sd */

	/* Statistics of this (local) group, i.e. the one holding this_cpu */
	/* avg_load of the local group */
	unsigned long this_load;
	/* set from the local group's sum_weighted_load */
	unsigned long this_load_per_task;
	/* number of running tasks in the local group */
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	/* avg_load of the busiest group */
	unsigned long max_load;
	/*
	 * set from the busiest group's sum_weighted_load; later divided by
	 * the busiest group's task count in find_busiest_group()
	 */
	unsigned long busiest_load_per_task;
	/* number of running tasks in the busiest group */
	unsigned long busiest_nr_running;

	int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance; /* Is powersave balance needed for this sd */
	struct sched_group *group_min; /* Least loaded group in sd */
	struct sched_group *group_leader; /* Group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* Nr running of group_leader */
	unsigned long min_nr_running; /* Nr running of group_min */
#endif
};

2. 만약 balance가 NULL이 아니고, 그 포인터가 가리키는 값이 0이라면, 즉 이 cpu가 lb를 하기에 적당한 cpu가 아니라면, goto ret
3. 만약 busiest가 없거나, busiest에 속한 task가 없다면, 즉 바쁜 group이 없다면 goto out_balanced
4. 만약 this_group의 load가 max_load보다 크다면, 즉 현재 속한 group이 제일 바쁘다면 goto out_balanced
5. sds의 평균 로드를 계산한다. 이는 sds의 총 load를 pwr로 나눈 것에 SCHED_LOAD_SCALE을 곱해서 계산한다.
6. 만약 현재 load가 sds의 평균 load보다 크다면, 즉 this_group이 다른 sd의 group보다 바쁘다면, goto out_balanced
7. 만약 max_load가 일정 범위 값보다 작다면, 즉 lb를 할 필요가 없으면 goto out_balanced
8. sds.busiest_load_per_task를 계산한다. (busiest의 task 수로 나눈다. )
9. sds에 group_imb가 set되어있다면 busiest_load_per_task를 자신과 avg_load 중에서 더 작은 값으로 고른다.
10. 만약 max_load가 busiest_load_per_task보다 작다면, 즉 busiest의 가장 큰 load가 busiest group의 load의 평균보다 작다면, task migration을 하면, 다음 번에 다시 load_balance를 해야하는 상황(ping pong)이 오기 때문에 lb를 하지 않는다. 다시 말해서 ping pong lb를 줄이기 위해서 goto out_balanced
11. calculate_imbalance(&sds, this_cpu, imbalance) 호출
12. return sds.busiest

out_balanced: obvious imbalance가 없어서 이곳으로 왔다. 하지만 save power를 위해 할 것이 있나 확인
1. 만약 save power할 것이 있다면 return sds.busiest , save power를 확인하기 위해서 check_power_save_busiest_group() 호출

ret: 아무것도 하지 않고 NULL 리턴
1. *imbalance = 0;
2. return NULL

update_sd_lb_stats() 함수의 동작: sched_domain의 stat를 update

Argument
sd: busiest_group이 return될 domain
this_cpu: load_balancing이 현재 일어나는 cpu
idle: this_cpu의 현재 idle type
sd_idle: sd의 idle type
cpus: load_balancing이 수행될 수 있는 cpu
balance: this_cpu가 현시점에 load_balancing을 하기에 적절한지 여부를 나타내는 포인터
sds: 정보를 저장할 구조체

1. struct sched_domain *child = sd->child
2. struct sched_group *group = sd->groups
3. 만약 child가 존재하고, child의 flag가 SD_PREFER_SIBLING 이면 prefer_sibling = 1
4. sds를 init_sd_power_savings_stats()를 이용해서 초기화 한다.
5. load_idx = get_sd_load_idx(sd, idle);

do 체크하는 group이 sd의 group안에 있을 동안은 계속 loop를 돈다.
1. local_group 을 알아낸다. 즉 this_cpu가 group에 속했는지 여부를 판단한다. 즉 현재 lb을 호출한 cpu가 지금 확인하고 있는 sched_group에 포함되어 있는지 여부를 확인한다.
2. 로컬 변수인 sgs 구조체를 0으로 초기화하고 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs) 함수를 이용해서 값을 넣는다.

/*
 * sg_lb_stats - statistics of a single sched_group gathered for load
 * balancing. Filled in by update_sg_lb_stats() for each group in turn.
 */
struct sg_lb_stats {
	unsigned long avg_load; /* Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	/* group's cpu_power divided by SCHED_LOAD_SCALE */
	unsigned long group_capacity;
	int group_imb; /* Is there an imbalance in the group ? */
};

3. 만약 local_group이고, balance가 0으로 되어있다면 return;
4. sds->total_load += sgs.group_load; sds->total_pwr += group->cpu_power; 현재 sd의 총 load와 power(작업량)을 계산한다.
5. 만약 prefer_sibling이라면, sgs.group_capacity = min(sgs.group_capacity, 1UL);
6. 만약 local_group이라면, 즉 this_cpu가 이 group에 포함되어 있다면
6.1 sds->this_load = sgs.avg_load;

6.2 sds->this = group;

6.3 sds->this_nr_running = sgs.sum_nr_running;

6.4 sds->this_load_per_task = sgs.sum_weighted_load;
7. 만약 local_group이 아니고, sgs의 avg_load가 sds의 max_load보다 크고, (sgs의 task 총 수가 sgs의 group_capacity보다 많거나 sgs의 group_imb가 set되어 있다면)
7.1 sds->max_load = sgs.avg_load;

7.2 sds->busiest = group;

7.3 sds->busiest_nr_running = sgs.sum_nr_running;

7.4 sds->busiest_load_per_task = sgs.sum_weighted_load;

7.5 sds->group_imb = sgs.group_imb;

8. update_sd_power_savings_stats(group, sds, local_group, &sgs);
9. group = group->next;

while(group != sd->groups)
즉 체크하는 group이 sd의 group안에 있을 때는 계속 do로 돌아가서 loop

update_sg_lb_stats() 함수의 동작: sched_group의 stat를 update

Argument
sd: busiest_group이 return될 domain
group: statistic이 update될 group
this_cpu: load_balancing이 현재 일어나는 cpu
idle: this_cpu의 현재 idle type
load_idx: load 계산을 위해서 this_cpu의 sched_domain의 Load index
sd_idle: sd의 idle type
local_group: group안에 this_cpu가 있는지 여부
cpus: load_balancing이 수행될 수 있는 cpu
balance: this_cpu가 현시점에 load_balancing을 하기에 적절한지 여부를 나타내는 포인터
sgs: 정보를 저장할 구조체

1. 만약 local_group이라면 balance_cpu = group의 첫번째 cpu로 한다.
1.1 만약 balance_cpu 가 this_cpu라면 update_group_power()를 호출한다. 이 함수는 sd의 group의 총 cpu_power를 계산해서 넣어주는 함수이다.
2. sum_avg_load_per_task와 avg_load_per_task를 0으로 하고, max_cpu_load를 0으로, min_cpu_load를 ~0UL으로 해서, 이 값에 각각 최대값과 최소값을 계산하기 용이하게 해놓는다.

for_each_cpu_and(i, sched_group_cpus(group), cpus) group에 속한 cpu 중에서 lb이 수행될 수 있는 cpu(cpus)들을 모두 loop
1. 해당하는 cpu의 rq를 얻는다.
2. 현재 sd가 idle이고(sd_idle), 얻은 rq에 task가 1개 이상 있다면 sd_idle을 0으로 한다. 즉 현재 cpu가 idle이 아니라는 뜻이다.
3. 만약 local_group이라면, 즉 this_cpu가 이 group에 포함되어 있다면
3.1 만약 현재 확인하는 cpu가 idle이고, 처음 확인한 것 이라면(first_idle_cpu를 이용해서 이 루틴은 한번만 들어오게 한다.)
3.1.1 first_idle_cpu = 1, 이 scope에 한번만 들어오게 하는 flag
3.1.2 현재 cpu를 balance_cpu로 한다. 현재가 idle이기 때문에 이 cpu로 migration을 시키자. (이 부분은 한번만 들어온다.!)
3.2 현재 확인하는 cpu의 load를 load에 저장한다. 이렇게 하는 이유는 sgs의 group_load를 얻기 위해서인데, 이는 cpu_group의 총 로드의 합이다.
4. 만약 local_group이 아니라면
4.1 현재 확인하는 cpu의 load를 load에 저장한다. 이렇게 하는 이유는 sgs의 group_load를 얻기 위해서인데, 이는 cpu_group의 총 로드의 합이다.
4.2 위에서 얻은 load의 최소값과 최대값을 track한다.
5. 위에서 구한 load의 합을 sgs->group_load로 저장한다.
6. sgs->sum_nr_running에 rq->nr_running을 더한다. 즉 group의 총 task 수를 누적한다.
7. sgs->sum_weighted_load에 각 cpu의 weighted_cpuload를 다 더한다.
8. sum_avg_load_per_task += cpu_avg_load_per_task(i)

loop 밖
9. 만약 cpu가 지금 바로 idle이 되지 않았고(idle이 CPU_NEWLY_IDLE이 아니고), local_group이고, balance_cpu가 this_cpu가 아니고, balance 포인터가 NULL이 아니라면
- *balance를 0으로 하고, 즉 현 시점에서 this_cpu가 lb를 하는게 적절하지 않다고 하고 return
10. sgs->avg_load,즉 전체 load를 cpu_power로 나눈 평균 load을 계산한다.
11. avg_load_per_task를 계산한다. 이 값은 각 task 별 평균 로드를 뜻한다. 이 값 역시 cpu_power로 나눠진 값이다.
12. 만약 각 group에 속한 cpu의 max_load와 min_load의 차이가 avg_load_per_task의 두 배 보다 클 경우에 group_imb를 set한다. 이 값은 group의 load가 imbalance하다는 것을 뜻한다.
13. group_capacity를 계산한다. 이 값은 group의 cpu_power를 SCHED_LOAD_SCALE로 나눠서 계산한다.

'Enginius > Linux' 카테고리의 다른 글

linux-2.6.24에서 linux-2.6.32로 porting하기 (dec_load(), ) (0)	2011.08.18
CFS(completely fair scheduler) vs DWRR(distributed weighted round robin) (0)	2011.08.12
Process diagram in Linux (0)	2011.08.02
리눅스 커널 실시간 스케줄링 우선순위 (0)	2011.08.01
리눅스 커널 스케쥴링 영역과 클래스 (0)	2011.08.01

Mad for Simplicity

Load balance in Linux source level bottom-up analysis

'Enginius > Linux' 카테고리의 다른 글

티스토리툴바

Load balance in Linux source level bottom-up analysis

'Enginius > Linux' 카테고리의 다른 글

'Enginius/Linux' Related Articles

티스토리툴바