struct pid & pid_namespace
struct pid & pid_namespace
alloc_pid() & task_struct插入pid struct tasks[] hash list
fork進程/線程時,copy_process()會給此線程alloc一個struct pid結構體。當是fork進程/線程時,copy_process()的pid參數將是NULL(不等於&init_struct_pid),所以會call alloc_pid():
/*
 * Excerpt from copy_process(): only the struct pid allocation step is shown.
 * The remainder of the function body (including the closing brace and the
 * bad_fork_cleanup_thread label) is not part of this excerpt.
 */
static __latent_entropy struct task_struct *copy_process(
					unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace,
					unsigned long tls,
					int node)
{
	/*
	 * Unless the caller passed in init_struct_pid, allocate a new
	 * struct pid in the namespace taken from
	 * p->nsproxy->pid_ns_for_children.
	 */
	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		if (IS_ERR(pid)) {
			/* Failure comes back as an error pointer; decode it. */
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}
看下alloc_pid()幹了些啥。首先它會alloc一個pid struct,然後設置這個pid struct:
調用idr_alloc_cyclic(),這個函數的返回值就是當前fork線程的pid;
設置pid裡numbers成員(nr和ns)
for循環會執行ns->level + 1次(i從ns->level遞減到0)。對於沒有開CONFIG_PID_NS時,pid namespace將只有一個level,所以ns->level都會是0,此時只會有一次循環,只會設置pid->numbers[0],0 index即是全局的pid,在整個系統中唯一;
如果開啟了CONFIG_PID_NS,此時ns->level將有可能不是0,此時pid->numbers[0]是全局的upid,pid->numbers[1]則屬於numbers[0]所在namespace的child namespace,pid->numbers[2]等依次類推。
alloc_pid()的參數ns在沒有開啟CONFIG_PID_NS的情況下,都是一樣的,即指向init_pid_ns
設置完pid struct後,調用idr_replace()將此pid struct和alloc的pid number作為一對mapping值保存起來:
/*
 * alloc_pid() - allocate a struct pid and reserve one numeric ID in @ns and
 * in each namespace reached by following ->parent from it.
 *
 * On the success path shown here the new struct pid is returned. If the
 * struct itself cannot be allocated, ERR_PTR(-ENOMEM) is returned.
 *
 * NOTE: this is an excerpt. The out_unlock and out_free error labels and the
 * function's closing brace are not part of the visible text.
 */
struct pid *alloc_pid(struct pid_namespace *ns)
{
	struct pid *pid;
	enum pid_type type;
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
	int retval = -ENOMEM;

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	tmp = ns;
	pid->level = ns->level;

	/*
	 * One iteration per level: start at @ns (index ns->level) and step
	 * to tmp->parent each time until index 0 has been filled in.
	 */
	for (i = ns->level; i >= 0; i--) {
		int pid_min = 1;

		idr_preload(GFP_KERNEL);
		spin_lock_irq(&pidmap_lock);

		/*
		 * init really needs pid 1, but after reaching the maximum
		 * wrap back to RESERVED_PIDS
		 */
		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
			pid_min = RESERVED_PIDS;

		/*
		 * Store a null pointer so find_pid_ns does not find
		 * a partially initialized PID (see below).
		 */
		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
				      pid_max, GFP_ATOMIC);
		spin_unlock_irq(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
			/* -ENOSPC is reported as -EAGAIN; other errors pass through. */
			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
			goto out_free;
		}

		/* Record the number and the namespace it was allocated in. */
		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
	}

	/* retval is still -ENOMEM at this point if the proc setup fails. */
	if (unlikely(is_child_reaper(pid))) {
		if (pid_ns_prepare_proc(ns))
			goto out_free;
	}

	get_pid_ns(ns);
	atomic_set(&pid->count, 1);
	/* Start with an empty task list for every pid type. */
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	/* Publish from the numbers[ns->level] entry down to numbers[0]. */
	upid = pid->numbers + ns->level;
	spin_lock_irq(&pidmap_lock);
	/* Bail out when the PIDNS_ADDING bit is not set in ns->pid_allocated. */
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	for ( ; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
	}
	spin_unlock_irq(&pidmap_lock);

	return pid;
alloc_pid()後,會設置當前fork的task_struct的pid成員,此pid成員就是當前fork出的線程的pid,這個pid數值即是上面alloc_pid()裡分配的pid結構體裡的numbers[0].nr(通過pid_nr()取得),即系統全局的線程的pid,具有唯一性:
/*
 * pid_nr() - return the numeric ID stored in numbers[0] of @pid.
 *
 * A NULL @pid yields 0.
 */
static inline pid_t pid_nr(struct pid *pid)
{
	return pid ? pid->numbers[0].nr : 0;
}
接下來則會將當前fork的task_struct和上面alloc的pid struct關聯起來。如果當前fork的線程是進程的主線程(thread group leader),則會將主線程鏈接到上面alloc給它的struct pid的tasks[PIDTYPE_PID] & tasks[PIDTYPE_TGID] hash list上,以及將它鏈接到其父進程所鏈接到的tasks[PIDTYPE_PGID]和tasks[PIDTYPE_SID] hash list上;
如果不是主線程,則只會將此task_struct插入上面給它alloc的pid struct的tasks[PIDTYPE_PID] hash list。
copy_process() init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization. */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; }
setpgid創建進程組或者遷移某個進程到另外一個進程組
1. setpgid創建新的進程組
此時setpgid(pid_t pid, pid_t pgid)的pid參數和pgid參數要相等(pgid傳0時內核會令pgid = pid,pid傳0時則取調用者自身的pid),並且此pid要是thread group leader,比如user space調用setpgid(getpid(), getpid())或者setpgid(0, 0)或者setpgid(getpid(), 0)。此後此進程將脫離其父進程所在的進程組,自己創建了一個獨立的進程組。
2. setpgid()遷移一個進程到另外的進程組
此時pgid參數不能為0,setpgid()的pgid參數是另外一個進程組的組長進程的pid,同時要遷移的進程所在的進程組和要遷往的進程組要在同一個session裡,此後要遷移的進程將遷入目標進程組,其task_struct將鏈接到目標進程組組長進程的pid struct的tasks[PIDTYPE_PGID] hash list
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; struct pid *pgrp; int err; if (!pid) pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; err = -EINVAL; if (!thread_group_leader(p)) goto out; if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; if (p != group_leader) goto out; } err = -EPERM; if (p->signal->leader) goto out; pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; pgrp = find_vpid(pgid); g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } err = security_task_setpgid(p, pgid); if (err) goto out; if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); return err; }
進程的task_struct所插入的pid struct tasks[] hash list
1. 如果進程沒有調用setpgid系統調用,並且其父進程也沒有執行此系統調用,則其鏈接關係如下圖,task_struct通過其pid_links[PIDTYPE_PID]/[PIDTYPE_TGID]插入它自己的struct pid的tasks[PIDTYPE_PID]/[PIDTYPE_TGID] hash list,其它pid_links[PIDTYPE_PGID]/[PIDTYPE_SID]應該是插入了init_struct_pid的tasks[PIDTYPE_PGID]/[PIDTYPE_SID] hash list:

2. 如果進程有執行setpgid創建了進程組,則pid_links[]的鏈接關係如下圖。
struct pid是某個進程fork時分配的,後面通過setpgid(0,0)創建一個進程組,首先將自己的task_struct通過pid_links[PIDTYPE_PGID]鏈接到自己pid struct的tasks[PIDTYPE_PGID] hash list上。後面此進程創建子進程時,子進程也都會類似這樣將其task_struct鏈入此pid struct的tasks[PIDTYPE_PGID] hash list上,這樣同一個進程組中的所有進程將會被鏈接到組長進程的pid struct的tasks[PIDTYPE_PGID] hash list上:

(1)進程組struct pid tasks[] hash list鏈接關係
* 進程組中的成員進程是以進程的主線程的task_struct/struct pid來表示
從上述兩圖可以看出,對於主線程,線程自己的pid struct裡的tasks[PIDTYPE_PID]/[PIDTYPE_TGID] hash list長度只有1,即只有一個list node,即為自己本身的task_struct.pid_links[PIDTYPE_PID]/[PIDTYPE_TGID]。
3. 非主線程的struct pid.tasks[] hash list鏈接關係

如果是非主線程,則只會用到一個hash list,即tasks[PIDTYPE_PID] hash list,並且此hash list也只有一個node,即此非主線程的task_struct.pid_links[PIDTYPE_PID],同時沒有和所在進程內的其它線程以及其它進程有鏈接關係,所以非主線程的struct pid.tasks[]鏈接關係很簡單
注意:
1. 不管是主線程還是非主線程,如果屬於user space的,均會給它alloc一個struct pid;
2. 不管是主線程還是非主線程,因為struct pid.tasks[PIDTYPE_PID] hash list上只有一個node,所以find_task_by_vpid()在tasks[PIDTYPE_PID] hash list上取第一個node就得到了pid_t對應的task_struct
CONFIG_PID_NS開啟條件下的多級pid_namespace

上述level 1是level 2的parent;level 0是level 1的parent.
一個level 2的線程fork時,會從level 2開始alloc pid,一直到level 0,所以這裡它會alloc 3個pid,即會alloc 3個pid namespace的pid number。
level 0是全局的,在通過pid_nr()設置task_struct pid_t成員時,其就是取的level 0 pid_namespace的pid number。
常用pid struct相關(guān)API
-
static inline pid_t task_pid_vnr(struct task_struct *tsk):根據task_struct得到對應的pid number(在當前進程所在pid namespace中看到的值)
-
struct task_struct *find_task_by_vpid(pid_t vnr):根據pid number(當前進程所在pid namespace中的值)得到對應的task_struct
浙公網安備 33010602011771號