SlideShare a Scribd company logo
pidfds
Process file descriptors on Linux
Christian Brauner
christian.brauner@ubuntu.com
@brau_ner
https://people.kernel.org/brauner
https://brauner.io
pidfd: what's that?
file descriptor referring to a process
specifically, an fd referring to a thread-group leader
stable, private handle
fd guarantees to reference the same process
pidfds use pre-existing stable process handle
reference struct pid, not task_struct
/*
 * struct pid - the object a pidfd refers to. The surrounding slide states
 * that pidfds reference struct pid rather than task_struct.
 */
struct pid
{
/* reference count for this structure */
refcount_t count;
/* NOTE(review): presumably the pid-namespace nesting depth - confirm */
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
/*
 * NOTE(review): declared with one element; presumably over-allocated to
 * hold one upid per namespace level - confirm against the allocator.
 */
struct upid numbers[1];
};
Why do this in the first place?
pid recycling
avoid pitfalls of pid recycling on high-pressure systems
CVE-2019-6133: https://bugs.chromium.org/p/project-zero/issues/detail?id=1692
CVE-2014-5033: https://www.cvedetails.com/cve/CVE-2014-5033/
pid-based mac exploits: https://objective-see.com/blog/blog_0x41.html
https://doc.qt.io/qt-5/qprocess.html#startDetached
https://marc.info/?l=openssl-dev&m=130289811108150&w=2
CVE-2017-13209: (Android - Hardware Service Manager Arbitrary Service Replacement due to getpidcon)
https://www.exploit-db.com/exploits/43513
Issue 851: Android: racy getpidcon usage permits binder service replacement
https://bugs.chromium.org/p/project-zero/issues/detail?id=851
Why do this in the first place?
shared libraries
allow to spawn invisible helper processes
process management delegation
hand off a handle to a non-parent process (e.g. for waiting, signaling)
ubiquity of fds
common patterns already exist everywhere in userspace
Does userspace really care about this feature?
dbus
https://gitlab.freedesktop.org/dbus/dbus/issues/274
qt
https://codereview.qt-project.org/c/qt/qtbase/+/108456
systemd
https://github.com/systemd/systemd/issues/13101
criu
https://github.com/checkpoint-restore/criu/issues/717
lmkd
https://android-review.googlesource.com/c/platform/system/core/+/1088157
bpftrace
https://github.com/iovisor/bpftrace/issues/880
mio
https://github.com/samuelbrian/mio-pidfd
Prior art
Illumos
pure userspace emulation of stable process handle
procopen(), procrun(), procclose(), procfree(), etc.
OpenBSD, NetBSD
no private, stable process handles
FreeBSD
procdesc: pdfork(), pdgetpid(), pdkill()
Linux
forkfd(), CLONE_FD
Building a new api
4 kernel releases
individual elements to create a complete api
5.1
sending signals
using pidfds to reliably send signals
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
int ret;
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
/* Enforce flags be set to 0 until we add an extension. */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
/* Is this a pidfd? */
pid = pidfd_to_pid(f.file);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
}
5.2
CLONE_PIDFD
create pidfds at process creation time
O_CLOEXEC
pidfds are close-on-exec by default
/proc/<pid>/fdinfo/<pidfd>
contains pid of process in procfs pidns
/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
O_RDWR | O_CLOEXEC);
if (IS_ERR(pidfile)) {
put_unused_fd(pidfd);
retval = PTR_ERR(pidfile);
goto bad_fork_free_pid;
}
get_pid(pid); /* held by pidfile now */
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
5.3
polling support
exit notification for non-parents
/*
 * do_notify_pidfd - wake every waiter queued on the task's pid.
 * @task: task whose struct pid wait queue (wait_pidfd) is woken.
 *
 * Warns if task->exit_state is still 0 when called.
 */
static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
WARN_ON(task->exit_state == 0);
pid = task_pid(task);
/* wake all waiters, not just one */
wake_up_all(&pid->wait_pidfd);
}
/*
 * Poll support for process exit notification.
 *
 * @file: the pidfd's file; its private_data is used as the struct pid.
 * @pts:  poll table handed to poll_wait().
 *
 * Returns POLLIN | POLLRDNORM when no task is found for the pid, or when
 * the task has a non-zero exit_state and thread_group_empty() holds for it;
 * returns 0 otherwise.
 */
static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct task_struct *task;
struct pid *pid = file->private_data;
int poll_flags = 0;
/* register on the same wait queue that do_notify_pidfd() wakes */
poll_wait(file, &pid->wait_pidfd, pts);
/* the task lookup and the checks on it happen inside the RCU read section */
rcu_read_lock();
task = pid_task(pid, PIDTYPE_PID);
/*
 * Inform pollers only when the whole thread group exits.
 * If the thread group leader exits before all other threads in the
 * group, then poll(2) should block, similar to the wait(2) family.
 */
if (!task || (task->exit_state && thread_group_empty(task)))
poll_flags = POLLIN | POLLRDNORM;
rcu_read_unlock();
return poll_flags;
}
5.3
pidfds without CLONE_PIDFD
pidfd_open() to create pidfd
/*
 * pidfd_open - create a pidfd for an existing process, without CLONE_PIDFD.
 * @pid:   numeric pid to look up; must be greater than 0.
 * @flags: must be 0.
 *
 * Returns the result of pidfd_create() on success. Returns -EINVAL when
 * flags is non-zero, when pid <= 0, or when the pid has no task under
 * PIDTYPE_TGID; returns -ESRCH when find_get_pid() finds nothing.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd, ret;
struct pid *p;
if (flags)
return -EINVAL;
if (pid <= 0)
return -EINVAL;
p = find_get_pid(pid);
if (!p)
return -ESRCH;
ret = 0;
rcu_read_lock();
/* only accept a pid that has a task attached under PIDTYPE_TGID */
if (!pid_task(p, PIDTYPE_TGID))
ret = -EINVAL;
rcu_read_unlock();
/* GNU "?:" shorthand: keep the error in ret if set, else create the fd */
fd = ret ?: pidfd_create(p);
/* NOTE(review): presumably balances the reference taken by find_get_pid() - confirm */
put_pid(p);
return fd;
}
5.4 (proposed)
waiting through pidfds
P_PIDFD for waitid()
case P_PIDFD:
type = PIDTYPE_PID;
if (upid < 0)
return -EINVAL;
pid = pidfd_get_pid(upid);
if (IS_ERR(pid))
return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
wo.wo_rusage = ru;
ret = do_wait(&wo);
Kill-on-close
SIGKILL on last close
kill process when last fd referencing it is closed
exclusive waiting
CLONE_WAIT_PID
hide process from generic wait requests (e.g. waitid(P_ALL))
Lessons learned
speed matters
choose a sustainable speed for developing features
be open about being "dumb"
it's ok to say "I don't know" or "I can't review that"
be resilient
reviews are a form of critique

More Related Content

PDF
Introduction to segmentation fault handling
PDF
Chromium Sandbox on Linux (NDC Security 2019)
PDF
Linux Security APIs and the Chromium Sandbox
PDF
breed_python_tx_redacted
PDF
ch6-pv2-device-drivers
PDF
Systems Programming Assignment Help - Processes
PPTX
Process management
Introduction to segmentation fault handling
Chromium Sandbox on Linux (NDC Security 2019)
Linux Security APIs and the Chromium Sandbox
breed_python_tx_redacted
ch6-pv2-device-drivers
Systems Programming Assignment Help - Processes
Process management

Similar to Kernel Recipes 2019 - pidfds: Process file descriptors on Linux (20)

PPT
1. Von Neumann + Booting Sequence + System Calls.ppt
PPTX
OS presentation (1).pptx
PPT
Threads Advance in System Administration with Linux
PPT
Unit 6
PPT
What is-a-computer-process-os
PPTX
UNIX_Process Control_Module3.pptx
PPTX
Linux IO
PPT
Unit 5
ODP
Sysprog 11
PDF
Operating Systems 1 (7/12) - Threads
PDF
Linux Internals - Part II
PPT
Processes in Linux.ppt
PPT
signal power point presentation it is useful
PPT
process creation OS
PPTX
Systemcall1
PPTX
Process management in operating system | process states | PCB | FORK() | Zomb...
PPTX
Lecture_Slide_4.pptx
PPT
Processes this has stuff about processes and deifntions.ppt
PPTX
operating system module 2 presentation notes
PPT
Process and Threads in Linux - PPT
1. Von Neumann + Booting Sequence + System Calls.ppt
OS presentation (1).pptx
Threads Advance in System Administration with Linux
Unit 6
What is-a-computer-process-os
UNIX_Process Control_Module3.pptx
Linux IO
Unit 5
Sysprog 11
Operating Systems 1 (7/12) - Threads
Linux Internals - Part II
Processes in Linux.ppt
signal power point presentation it is useful
process creation OS
Systemcall1
Process management in operating system | process states | PCB | FORK() | Zomb...
Lecture_Slide_4.pptx
Processes this has stuff about processes and deifntions.ppt
operating system module 2 presentation notes
Process and Threads in Linux - PPT
Ad

More from Anne Nicolas (20)

PDF
Kernel Recipes 2019 - Driving the industry toward upstream first
PDF
Kernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMI
PDF
Kernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernel
PDF
Kernel Recipes 2019 - Metrics are money
PDF
Kernel Recipes 2019 - Kernel documentation: past, present, and future
PDF
Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...
PDF
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
PDF
Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...
PDF
Embedded Recipes 2019 - Remote update adventures with RAUC, Yocto and Barebox
PDF
Embedded Recipes 2019 - Making embedded graphics less special
PDF
Embedded Recipes 2019 - Linux on Open Source Hardware and Libre Silicon
PDF
Embedded Recipes 2019 - From maintaining I2C to the big (embedded) picture
PDF
Embedded Recipes 2019 - Testing firmware the devops way
PDF
Embedded Recipes 2019 - Herd your socs become a matchmaker
PDF
Embedded Recipes 2019 - LLVM / Clang integration
PDF
Embedded Recipes 2019 - Introduction to JTAG debugging
PDF
Embedded Recipes 2019 - Pipewire a new foundation for embedded multimedia
PDF
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
PDF
Kernel Recipes 2019 - Suricata and XDP
PDF
Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)
Kernel Recipes 2019 - Driving the industry toward upstream first
Kernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMI
Kernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernel
Kernel Recipes 2019 - Metrics are money
Kernel Recipes 2019 - Kernel documentation: past, present, and future
Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...
Embedded Recipes 2019 - Remote update adventures with RAUC, Yocto and Barebox
Embedded Recipes 2019 - Making embedded graphics less special
Embedded Recipes 2019 - Linux on Open Source Hardware and Libre Silicon
Embedded Recipes 2019 - From maintaining I2C to the big (embedded) picture
Embedded Recipes 2019 - Testing firmware the devops way
Embedded Recipes 2019 - Herd your socs become a matchmaker
Embedded Recipes 2019 - LLVM / Clang integration
Embedded Recipes 2019 - Introduction to JTAG debugging
Embedded Recipes 2019 - Pipewire a new foundation for embedded multimedia
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - Suricata and XDP
Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)
Ad

Recently uploaded (20)

PPTX
Reimagine Home Health with the Power of Agentic AI​
PPTX
Operating system designcfffgfgggggggvggggggggg
PDF
How to Choose the Right IT Partner for Your Business in Malaysia
PDF
System and Network Administration Chapter 2
PDF
SAP S4 Hana Brochure 3 (PTS SYSTEMS AND SOLUTIONS)
PDF
Which alternative to Crystal Reports is best for small or large businesses.pdf
PDF
PTS Company Brochure 2025 (1).pdf.......
PDF
Upgrade and Innovation Strategies for SAP ERP Customers
PDF
Raksha Bandhan Grocery Pricing Trends in India 2025.pdf
PPTX
history of c programming in notes for students .pptx
PDF
2025 Textile ERP Trends: SAP, Odoo & Oracle
PDF
How to Migrate SBCGlobal Email to Yahoo Easily
PPTX
Essential Infomation Tech presentation.pptx
PDF
Digital Strategies for Manufacturing Companies
PDF
Flood Susceptibility Mapping Using Image-Based 2D-CNN Deep Learnin. Overview ...
PPTX
Transform Your Business with a Software ERP System
PDF
Understanding Forklifts - TECH EHS Solution
PDF
Claude Code: Everyone is a 10x Developer - A Comprehensive AI-Powered CLI Tool
PPTX
ai tools demonstartion for schools and inter college
PDF
EN-Survey-Report-SAP-LeanIX-EA-Insights-2025.pdf
Reimagine Home Health with the Power of Agentic AI​
Operating system designcfffgfgggggggvggggggggg
How to Choose the Right IT Partner for Your Business in Malaysia
System and Network Administration Chapter 2
SAP S4 Hana Brochure 3 (PTS SYSTEMS AND SOLUTIONS)
Which alternative to Crystal Reports is best for small or large businesses.pdf
PTS Company Brochure 2025 (1).pdf.......
Upgrade and Innovation Strategies for SAP ERP Customers
Raksha Bandhan Grocery Pricing Trends in India 2025.pdf
history of c programming in notes for students .pptx
2025 Textile ERP Trends: SAP, Odoo & Oracle
How to Migrate SBCGlobal Email to Yahoo Easily
Essential Infomation Tech presentation.pptx
Digital Strategies for Manufacturing Companies
Flood Susceptibility Mapping Using Image-Based 2D-CNN Deep Learnin. Overview ...
Transform Your Business with a Software ERP System
Understanding Forklifts - TECH EHS Solution
Claude Code: Everyone is a 10x Developer - A Comprehensive AI-Powered CLI Tool
ai tools demonstartion for schools and inter college
EN-Survey-Report-SAP-LeanIX-EA-Insights-2025.pdf

Kernel Recipes 2019 - pidfds: Process file descriptors on Linux

  • 1. pidfds Process file descriptors on Linux Christian Brauner christian.brauner@ubuntu.com @brau_ner https://people.kernel.org/brauner https://brauner.io
  • 2. pidfd: what's that? file descriptor referring to a process specifically, an fd referring to a thread-group leader stable, private handle fd guarantees to reference the same process pidfds use pre-existing stable process handle reference struct pid, not task_struct struct pid { refcount_t count; unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; };
  • 3. Why do this in the first place? pid recycling avoid pitfalls of pid recycling on high-pressure systems CVE-2019-6133: https://bugs.chromium.org/p/project-zero/issues/detail?id=1692 CVE-2014-5033: https://www.cvedetails.com/cve/CVE-2014-5033/ pid-based mac exploits: https://objective-see.com/blog/blog_0x41.html https://doc.qt.io/qt-5/qprocess.html#startDetached https://marc.info/?l=openssl-dev&m=130289811108150&w=2 CVE-2017-13209: (Android - Hardware Service Manager Arbitrary Service Replacement due to getpidcon) https://www.exploit-db.com/exploits/43513 Issue 851: Android: racy getpidcon usage permits binder service replacement https://bugs.chromium.org/p/project-zero/issues/detail?id=851
  • 4. Why do this in the first place? shared libraries allow to spawn invisible helper processes process management delegation hand off a handle to a non-parent process (e.g. for waiting, signaling) ubiquity of fds common patterns already exist everywhere in userspace
  • 5. Does userspace really care about this feature? dbus https://gitlab.freedesktop.org/dbus/dbus/issues/274 qt https://codereview.qt-project.org/c/qt/qtbase/+/108456 systemd https://github.com/systemd/systemd/issues/13101 criu https://github.com/checkpoint-restore/criu/issues/717 lmkd https://android-review.googlesource.com/c/platform/system/core/+/1088157 bpftrace https://github.com/iovisor/bpftrace/issues/880 mio https://github.com/samuelbrian/mio-pidfd
  • 6. Prior art Illumos pure userspace emulation of stable process handle procopen(), procrun(), procclose(), procfree(), etc. OpenBSD, NetBSD no private, stable process handles FreeBSD procdesc: pdfork(), pdgetpid(), pdkill() Linux forkfd(), CLONE_FD
  • 7. Building a new api 4 kernel releases individual elements to create a complete api
  • 8. 5.1 sending signals using pidfds to reliably send signals SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { int ret; struct fd f; struct pid *pid; kernel_siginfo_t kinfo; /* Enforce flags be set to 0 until we add an extension. */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; }
  • 9. 5.2 CLONE_PIDFD create pidfds at process creation time O_CLOEXEC pidfds are close-on-exec by default /proc/<pid>/fd/fdinfo contains pid of process in procfs pidns /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, O_RDWR | O_CLOEXEC); if (IS_ERR(pidfile)) { put_unused_fd(pidfd); retval = PTR_ERR(pidfile); goto bad_fork_free_pid; } get_pid(pid); /* held by pidfile now */ retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; }
  • 10. 5.3 polling support exit notification for non-parents static void do_notify_pidfd(struct task_struct *task) { struct pid *pid; WARN_ON(task->exit_state == 0); pid = task_pid(task); wake_up_all(&pid->wait_pidfd); } /* * Poll support for process exit notification. */ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; int poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) poll_flags = POLLIN | POLLRDNORM; rcu_read_unlock(); return poll_flags; }
  • 11. 5.3 pidfds without CLONE_PIDFD pidfd_open() to create pidfd SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd, ret; struct pid *p; if (flags) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; ret = 0; rcu_read_lock(); if (!pid_task(p, PIDTYPE_TGID)) ret = -EINVAL; rcu_read_unlock(); fd = ret ?: pidfd_create(p); put_pid(p); return fd; }
  • 12. 5.4 (proposed) waiting through pidfds P_PIDFD for waitid() case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; ret = do_wait(&wo);
  • 13. Kill-on-close SIGKILL on last close kill process when last fd referencing it is closed
  • 14. exclusive waiting CLONE_WAIT_PID hide process from generic wait requests (e.g. waitid(P_ALL))
  • 15. Lessons learned speed matters choose a sustainable speed for developing features be open about being "dumb" it's ok to say "I don't know" or "I can't review that" be resilient reviews are a form of critique