Skip to content

Commit ae10b2b

Browse files
jbaron-lgtm
authored and torvalds committed
epoll: optimize EPOLL_CTL_DEL using rcu
Nathan Zimmer found that once we get over 10+ cpus, the scalability of SPECjbb falls over due to the contention on the global 'epmutex', which is taken in on EPOLL_CTL_ADD and EPOLL_CTL_DEL operations. Patch #1 removes the 'epmutex' lock completely from the EPOLL_CTL_DEL path by using rcu to guard against any concurrent traversals. Patch #2 remove the 'epmutex' lock from EPOLL_CTL_ADD operations for simple topologies. IE when adding a link from an epoll file descriptor to a wakeup source, where the epoll file descriptor is not nested. This patch (of 2): Optimize EPOLL_CTL_DEL such that it does not require the 'epmutex' by converting the file->f_ep_links list into an rcu one. In this way, we can traverse the epoll network on the add path in parallel with deletes. Since deletes can't create loops or worse wakeup paths, this is safe. This patch in combination with the patch "epoll: Do not take global 'epmutex' for simple topologies", shows a dramatic performance improvement in scalability for SPECjbb. Signed-off-by: Jason Baron <[email protected]> Tested-by: Nathan Zimmer <[email protected]> Cc: Eric Wong <[email protected]> Cc: Nelson Elhage <[email protected]> Cc: Al Viro <[email protected]> Cc: Davide Libenzi <[email protected]> Cc: "Paul E. McKenney" <[email protected]> CC: Wu Fengguang <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 823b794 commit ae10b2b

File tree

1 file changed

+32
-24
lines changed

1 file changed

+32
-24
lines changed

fs/eventpoll.c

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <linux/proc_fs.h>
4242
#include <linux/seq_file.h>
4343
#include <linux/compat.h>
44+
#include <linux/rculist.h>
4445

4546
/*
4647
* LOCKING:
@@ -133,8 +134,12 @@ struct nested_calls {
133134
* of these on a server and we do not want this to take another cache line.
134135
*/
135136
struct epitem {
136-
/* RB tree node used to link this structure to the eventpoll RB tree */
137-
struct rb_node rbn;
137+
union {
138+
/* RB tree node links this structure to the eventpoll RB tree */
139+
struct rb_node rbn;
140+
/* Used to free the struct epitem */
141+
struct rcu_head rcu;
142+
};
138143

139144
/* List header used to link this structure to the eventpoll ready list */
140145
struct list_head rdllink;
@@ -671,6 +676,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
671676
return error;
672677
}
673678

679+
static void epi_rcu_free(struct rcu_head *head)
680+
{
681+
struct epitem *epi = container_of(head, struct epitem, rcu);
682+
kmem_cache_free(epi_cache, epi);
683+
}
684+
674685
/*
675686
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
676687
* all the associated resources. Must be called with "mtx" held.
@@ -692,8 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
692703

693704
/* Remove the current item from the list of epoll hooks */
694705
spin_lock(&file->f_lock);
695-
if (ep_is_linked(&epi->fllink))
696-
list_del_init(&epi->fllink);
706+
list_del_rcu(&epi->fllink);
697707
spin_unlock(&file->f_lock);
698708

699709
rb_erase(&epi->rbn, &ep->rbr);
@@ -704,9 +714,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
704714
spin_unlock_irqrestore(&ep->lock, flags);
705715

706716
wakeup_source_unregister(ep_wakeup_source(epi));
707-
708-
/* At this point it is safe to free the eventpoll item */
709-
kmem_cache_free(epi_cache, epi);
717+
/*
718+
* At this point it is safe to free the eventpoll item. Use the union
719+
* field epi->rcu, since we are trying to minimize the size of
720+
* 'struct epitem'. The 'rbn' field is no longer in use. Protected by
721+
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
722+
* use of the rbn field.
723+
*/
724+
call_rcu(&epi->rcu, epi_rcu_free);
710725

711726
atomic_long_dec(&ep->user->epoll_watches);
712727

@@ -872,7 +887,6 @@ static const struct file_operations eventpoll_fops = {
872887
*/
873888
void eventpoll_release_file(struct file *file)
874889
{
875-
struct list_head *lsthead = &file->f_ep_links;
876890
struct eventpoll *ep;
877891
struct epitem *epi;
878892

@@ -890,17 +904,12 @@ void eventpoll_release_file(struct file *file)
890904
* Besides, ep_remove() acquires the lock, so we can't hold it here.
891905
*/
892906
mutex_lock(&epmutex);
893-
894-
while (!list_empty(lsthead)) {
895-
epi = list_first_entry(lsthead, struct epitem, fllink);
896-
907+
list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
897908
ep = epi->ep;
898-
list_del_init(&epi->fllink);
899909
mutex_lock_nested(&ep->mtx, 0);
900910
ep_remove(ep, epi);
901911
mutex_unlock(&ep->mtx);
902912
}
903-
904913
mutex_unlock(&epmutex);
905914
}
906915

@@ -1138,7 +1147,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
11381147
struct file *child_file;
11391148
struct epitem *epi;
11401149

1141-
list_for_each_entry(epi, &file->f_ep_links, fllink) {
1150+
/* CTL_DEL can remove links here, but that can't increase our count */
1151+
rcu_read_lock();
1152+
list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
11421153
child_file = epi->ep->file;
11431154
if (is_file_epoll(child_file)) {
11441155
if (list_empty(&child_file->f_ep_links)) {
@@ -1160,6 +1171,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
11601171
"file is not an ep!\n");
11611172
}
11621173
}
1174+
rcu_read_unlock();
11631175
return error;
11641176
}
11651177

@@ -1286,7 +1298,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
12861298

12871299
/* Add the current item to the list of active epoll hook for this file */
12881300
spin_lock(&tfile->f_lock);
1289-
list_add_tail(&epi->fllink, &tfile->f_ep_links);
1301+
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
12901302
spin_unlock(&tfile->f_lock);
12911303

12921304
/*
@@ -1327,8 +1339,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
13271339

13281340
error_remove_epi:
13291341
spin_lock(&tfile->f_lock);
1330-
if (ep_is_linked(&epi->fllink))
1331-
list_del_init(&epi->fllink);
1342+
list_del_rcu(&epi->fllink);
13321343
spin_unlock(&tfile->f_lock);
13331344

13341345
rb_erase(&epi->rbn, &ep->rbr);
@@ -1844,15 +1855,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
18441855
* and hang them on the tfile_check_list, so we can check that we
18451856
* haven't created too many possible wakeup paths.
18461857
*
1847-
* We need to hold the epmutex across both ep_insert and ep_remove
1848-
* b/c we want to make sure we are looking at a coherent view of
1849-
* epoll network.
1858+
* We need to hold the epmutex across ep_insert to prevent
1859+
* multple adds from creating loops in parallel.
18501860
*/
1851-
if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1861+
if (op == EPOLL_CTL_ADD) {
18521862
mutex_lock(&epmutex);
18531863
did_lock_epmutex = 1;
1854-
}
1855-
if (op == EPOLL_CTL_ADD) {
18561864
if (is_file_epoll(tf.file)) {
18571865
error = -ELOOP;
18581866
if (ep_loop_check(ep, tf.file) != 0) {

0 commit comments

Comments
 (0)