Linux

프로세스 생성 분석 fork (태스크 복사 및 초기화 부분) - 2

Jminu 2025. 9. 30. 17:33

fork가 호출되면 결국 kernel_clone이 호출된다는 사실을 알았다.

kernel_clone에서는 kernel_clone_args, pt_regs 등등 구조체를 많이 사용하는데,

어떤 구조체인지 알아보자.

 

(코드가 너무 긴건, linux 깃헙 링크로 대체한다)

 

일단 kernel_clone에서는 전달받은 kernel_clone_args를 다른 함수의 인자로 많이 넘긴다.

/*
 * Clone a new process/thread from current using the parameters in @args.
 * Returns the child's pid as seen from the caller's pid namespace, or a
 * negative errno on failure.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
	u64 clone_flags = args->flags;
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	pid_t nr;

	/*
	 * CLONE_PIDFD and CLONE_PARENT_SETTID must not target the same
	 * user address: legacy clone() returns the pidfd through the
	 * parent_tid argument, so the two would clobber each other.
	 */
	if ((clone_flags & CLONE_PIDFD) &&
	    (clone_flags & CLONE_PARENT_SETTID) &&
	    (args->pidfd == args->parent_tid))
		return -EINVAL;

	/*
	 * Pick which ptrace event to report for this kind of fork, unless
	 * CLONE_UNTRACED was requested or the tracer is not listening for
	 * that event.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if (args->exit_signal != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	/* The heavy lifting: duplicate current into a new task_struct. */
	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
	add_latent_entropy();

	if (IS_ERR(p))
		return PTR_ERR(p);

	/* Trace before waking the child: p may exit as soon as it runs. */
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);	/* pid value in the caller's pid namespace */

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	/*
	 * For vfork(): set up a completion the parent will block on until
	 * the child releases the shared mm; take an extra task reference
	 * so p stays valid while we wait.
	 */
	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
		/* lock the task to synchronize with memcg migration */
		task_lock(p);
		lru_gen_add_mm(p->mm);
		task_unlock(p);
	}

	/* Mark the child TASK_RUNNING and enqueue it on a CPU runqueue. */
	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	put_pid(pid);
	return nr;
}

kernel_clone에서는

27번줄 : p = copy_process(NULL, trace, NUMA_NO_NODE, args); 

이렇게 copy_process를 호출하는 것을 볼 수 있고 인자로 kernel_clone_args구조체 변수인 args를 전달한다.

 

잠깐 살펴보자면, kernel_clone_args구조체 정의는 이렇게 되어있다.

나중에 차근차근 살펴보자.

  /*
   * Argument bundle carried from the fork()/clone()/clone3() entry
   * points down into kernel_clone() and copy_process().
   */
  struct kernel_clone_args {
          u64 flags;                /* CLONE_* flags controlling what the child shares */
          int __user *pidfd;        /* user address receiving the pidfd (CLONE_PIDFD) */
          int __user *child_tid;    /* user address for the child's tid -- see clone(2) */
          int __user *parent_tid;   /* written with child's pid when CLONE_PARENT_SETTID */
          const char *name;
          int exit_signal;          /* signal sent to parent on child exit (SIGCHLD for plain fork) */
          u32 kthread:1;            /* creating a kernel thread */
          u32 io_thread:1;
          u32 user_worker:1;
          u32 no_files:1;           /* forwarded to copy_files(); presumably suppresses fd-table copy -- verify in copy_files() */
          unsigned long stack;      /* child stack base supplied by clone/clone3 */
          unsigned long stack_size;
          unsigned long tls;        /* new TLS descriptor (CLONE_SETTLS) */
          pid_t *set_tid;           /* requested pids per namespace (clone3) */
          /* Number of elements in *set_tid */
          size_t set_tid_size;
          int cgroup;               /* target cgroup fd (CLONE_INTO_CGROUP) */
          int idle;
          int (*fn)(void *);        /* entry function for kernel threads */
          void *fn_arg;             /* argument passed to fn */
          struct cgroup *cgrp;
          struct css_set *cset;
          unsigned int kill_seq;
  };

 

kernel_clone_args구조체 필드에 대한 의미들은 일단 copy_process를 보고나면

더 감이 잡힐 것만 같다.

 

copy_process 정의를 보면

https://github.com/Jminu/linux/blob/master/kernel/fork.c

 

linux/kernel/fork.c at master · Jminu/linux

Linux kernel source tree. Contribute to Jminu/linux development by creating an account on GitHub.

github.com

copy_process에서

p = dup_task_struct(current, node);

dup_task_struct를 호출하며 인자에 current를 넣는다.

여기서 current는 task_struct 구조체 변수.

 

task_struct 구조체의 정의를 일단 봐보자.

 struct task_struct {
  #ifdef CONFIG_THREAD_INFO_IN_TASK       
          /*
           * For reasons of header soup (see current_thread_info()), this
           * must be the first element of task_struct.
           */
          struct thread_info              thread_info;
  #endif
          unsigned int                    __state;
  
          /* saved state for "spinlock sleepers" */
          unsigned int                    saved_state;

thread_info구조체를 필드로 가지고,

 

thread_info구조체는 아래와 같이 정의되어있다.

  /*
   * Low-level per-task state (this is the arm64 flavour -- note the
   * saved TTBR0_EL1).  With CONFIG_THREAD_INFO_IN_TASK it is embedded
   * as the very first member of task_struct.
   */
  struct thread_info {
          unsigned long           flags;          /* low level flags */
  #ifdef CONFIG_ARM64_SW_TTBR0_PAN
          u64                     ttbr0;          /* saved TTBR0_EL1 */
  #endif
          /* preempt_count viewed either as one u64 or as two u32 halves */
          union {
                  u64             preempt_count;  /* 0 => preemptible, <0 => bug */
                  struct {
  #ifdef CONFIG_CPU_BIG_ENDIAN
                          u32     need_resched;
                          u32     count;
  #else
                          u32     count;
                          u32     need_resched;
  #endif
                  } preempt;
          };
  #ifdef CONFIG_SHADOW_CALL_STACK
          void                    *scs_base;      /* shadow call stack base */
          void                    *scs_sp;        /* shadow call stack pointer */
  #endif
          u32                     cpu;            /* cpu number */
  };

참으로 많은 것들이 얽혀있다..!

차차 구조체를 살펴보도록하자..


방금 나왔던 dup_task_struct 정의를보자.

/*
 * Allocate a new task_struct plus kernel stack and copy @orig's
 * task_struct contents into it.  Returns the new task on success,
 * NULL on allocation failure.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	int err;

	/* No NUMA node requested: let the arch/policy pick one for the fork. */
	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	/* Arch-specific wholesale copy of orig's task_struct into tsk. */
	err = arch_dup_task_struct(tsk, orig);
	if (err)
		goto free_tsk;

	/* The child gets its own kernel stack. */
	err = alloc_thread_stack_node(tsk, node);
	if (err)
		goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif
	account_kernel_stack(tsk, 1);

	err = scs_prepare(tsk, node);
	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	/* Plant STACK_END_MAGIC so stack overflow can be detected later. */
	set_task_stack_end_magic(tsk);
	clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
	/* Fresh canary: the child must not share the parent's. */
	tsk->stack_canary = get_random_canary();
#endif
	/* If orig used its embedded cpumask, point tsk at its own copy. */
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;
	dup_user_cpus_ptr(tsk, orig, node);

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	/* Reset per-task fields that must not be inherited from orig. */
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;
	tsk->worker_private = NULL;

	kcov_task_init(tsk);
	kmsan_task_create(tsk);
	kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_disk = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
	tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_CPU_SUP_INTEL
	tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
	tsk->mm_cid = -1;
	tsk->last_mm_cid = -1;
	tsk->mm_cid_active = 0;
	tsk->migrate_from_cpu = -1;
#endif
	return tsk;

	/* Error unwinding: release whatever was allocated before failing. */
free_stack:
	exit_task_stack_account(tsk);
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

중요하다고 생각하는 함수만 살펴보자

 

12번줄 : err = arch_dup_task_struct(tsk, orig);

  • tsk : 새로운 태스크
  • orig : copy_process에서 호출한 dup_task_struct(current, node)의 인자 current => 즉, 현재 태스크

기존 태스크 orig의 스레드 정보를 새로운 태스크 tsk의 스레드 정보에 채운다.

 

39번줄 : setup_thread_stack(tsk, orig);

  • #define setup_thread_stack(new,old) do { } while(0) 이렇게 정의됨
  • 아무 일도 하지 않고 넘어감. CONFIG_THREAD_INFO_IN_TASK가 설정된 환경에서는 thread_info가 task_struct의 첫 번째 필드로 포함되어 있어(위 task_struct 정의 참고) arch_dup_task_struct에서 이미 함께 복사되므로, 별도로 해 줄 일이 없어 빈 매크로로 정의된 것이다.

42번줄 : set_task_stack_end_magic(tsk);

 /*
  * Write STACK_END_MAGIC at the far end of @tsk's kernel stack; if the
  * magic value is later found overwritten, the stack has overflowed.
  */
 void set_task_stack_end_magic(struct task_struct *tsk)
 {
         unsigned long *stackend;
 
         stackend = end_of_stack(tsk);
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
 }

에서 end_of_stack(tsk)를 호출

 

end_of_stack정의

    /*
     * Return the address of the end of @task's kernel stack: the last
     * word at the top when stacks grow up (CONFIG_STACK_GROWSUP),
     * otherwise the base address itself (stacks grow down).
     */
    static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
  {
  #ifdef CONFIG_STACK_GROWSUP
          return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
  #else
          return task->stack;
  #endif
  }

새로운 태스크(task_struct 구조체 변수)의 stack 필드를 기준으로 스택의 끝 주소를 구한다. 스택이 위로 자라는 CONFIG_STACK_GROWSUP 환경에서는 stack에 THREAD_SIZE를 더한 뒤 -1 한 주소를, 그 외(일반적인 경우)에는 stack 필드 값을 그대로 반환한다.

즉, 스택주소의 끝을 반환함. 그래서 end_of_stack인듯하다.

 

그다음, *stackend = STACK_END_MAGIC에서

STACK_END_MAGIC은

#define STACK_END_MAGIC         0x57AC6E9D
이렇게 정의되어 있다.

스택의 끝을 표시한다. -> 나중에 stack overflow생겼는지 감지한다.

 

요약하면 p = dup_task_struct(current, node);

현재의 task_struct를 새로운 태스크의 task_struct에 복사

 

copy_process의 나머지 부분

/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_sched_cancel_fork;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p, args->no_files);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(p, args);
	if (retval)
		goto bad_fork_cleanup_io;

이런방식으로 부모의 정보를 자식에게 쭉 복사하는걸 알 수 있음.

그리고 마지막에 p를 반환하는데, 여기서 p는 복사완료된 task_struct임 (새로 생성된 프로세스)

---- 여기까지 copy_process 분석 끝.----

 

다시 kernel_clone을 보자.

/* When non-zero, log every kernel_clone() call with a stack trace (debug aid). */
static int debug_kernel_thread = 1;
/*
 * Clone a new process/thread from current using the parameters in @args.
 * Returns the child's pid in the caller's pid namespace, or a negative
 * errno on failure.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
	u64 clone_flags = args->flags;
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	pid_t nr;

	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
	 * field in struct clone_args and it still doesn't make sense to have
	 * them both point at the same memory location. Performing this check
	 * here has the advantage that we don't need to have a separate helper
	 * to check for legacy clone().
	 */
	if ((clone_flags & CLONE_PIDFD) &&
	    (clone_flags & CLONE_PARENT_SETTID) &&
	    (args->pidfd == args->parent_tid))
		return -EINVAL;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if (args->exit_signal != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
	add_latent_entropy();

	/*
	 * Debug instrumentation: report who is forking and from where.
	 * Fixed the format string, which had lost its '\n' escape
	 * ("process n ") and so never terminated the log line.
	 */
	if (debug_kernel_thread) {
		printk("[+][%s] process\n", current->comm);
		dump_stack();
	}

	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);	/* pid value in the caller's pid namespace */

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	/* vfork(): parent blocks on this completion until child releases mm. */
	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
		/* lock the task to synchronize with memcg migration */
		task_lock(p);
		lru_gen_add_mm(p->mm);
		task_unlock(p);
	}

	/* Make the child runnable and enqueue it on a CPU runqueue. */
	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	put_pid(pid);
	return nr;
}

79번줄 : wake_up_new_task(p);

  • p는 앞서 계속 언급했듯이 새로 생성된 프로세스이며, wake_up_new_task는 이 프로세스를 깨운다.

wake_up_new_task 정의

/*
 * First wakeup of a freshly forked task @p: mark it TASK_RUNNING, pick
 * a CPU for it, enqueue it on that CPU's runqueue and preempt the
 * current task there if appropriate.
 */
void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	int wake_flags = WF_FORK;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	/* The new task becomes runnable here. */
	WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_ptr can change in the fork path
	 *  - any previously selected CPU might disappear through hotplug
	 *
	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
	 * as we're not fully set-up yet.
	 */
	p->recent_used_cpu = task_cpu(p);
	rseq_migrate(p);
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
#endif
	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	post_init_entity_util_avg(p);

	/* Enqueue on the runqueue and let it preempt if it should. */
	activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
	trace_sched_wakeup_new(p);
	wakeup_preempt(rq, p, wake_flags);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Nothing relies on rq->lock after this, so it's fine to
		 * drop it.
		 */
		rq_unpin_lock(rq, &rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, &rf);
	}
#endif
	task_rq_unlock(rq, p, &rf);
}

8번줄 : WRITE_ONCE(p->__state, TASK_RUNNING);

생성한 프로세스를 RUNNING상태로 바꾼다.

 

분석 끝.

요약하면, kernel_clone 발생 -> copy_process에서 부모의 task_struct를 쭉 복사 및 초기화 -> kernel_clone으로 복귀해서 wake_up_new_task호출하여 생성한 프로세스를 러닝상태로 변경 -> nr이라는 pid를 반환한다.