Diffstat (limited to 'recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.9/0070-epoll-limit-paths.patch')

 recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.9/0070-epoll-limit-paths.patch | 475 ++++++++++++++++++++++++++
 1 file changed, 475 insertions(+), 0 deletions(-)
diff --git a/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.9/0070-epoll-limit-paths.patch b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.9/0070-epoll-limit-paths.patch
new file mode 100644
index 00000000..434e2eb0
--- /dev/null
+++ b/recipes-kernel/linux/linux-ti33x-psp-3.2/3.2.9/0070-epoll-limit-paths.patch
@@ -0,0 +1,475 @@
From 025c3f2ad21385eddb5bec1f742c1cdb6164ca30 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 12 Jan 2012 17:17:43 -0800
Subject: [PATCH 70/72] epoll: limit paths

commit 28d82dc1c4edbc352129f97f4ca22624d1fe61de upstream.

The current epoll code can be tickled to run basically indefinitely in
both the loop detection path check (on ep_insert()) and in the wakeup
paths. The programs that tickle this behavior set up deeply linked
networks of epoll file descriptors that cause the epoll algorithms to
traverse them indefinitely. A couple of these sample programs have been
previously posted in this thread: https://lkml.org/lkml/2011/2/25/297.

To fix the loop detection path check algorithms, I simply keep track of
the epoll nodes that have already been visited. Thus, the loop detection
becomes proportional to the number of epoll file descriptors and links.
This dramatically decreases the run-time of the loop check algorithm. In
one diabolical case I tried, it reduced the run-time from 15 minutes (all
in kernel time) to 0.3 seconds.

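As a rough userspace sketch (not part of the patch; error handling
trimmed), such a nested network is built simply by adding one epoll file
descriptor into another with EPOLL_CTL_ADD, and the loop detection pass
has to walk whatever graph results:

#include <stdio.h>
#include <sys/epoll.h>

#define NFDS 5	/* a small chain; dense variants triggered the bad runtimes */

int main(void)
{
	int efd[NFDS], i;
	struct epoll_event ev = { .events = EPOLLIN };

	for (i = 0; i < NFDS; i++) {
		efd[i] = epoll_create1(0);
		if (efd[i] < 0) {
			perror("epoll_create1");
			return 1;
		}
	}
	/* Chain them: efd[0] watches efd[1], which watches efd[2], ... */
	for (i = 0; i < NFDS - 1; i++) {
		ev.data.fd = efd[i + 1];
		if (epoll_ctl(efd[i], EPOLL_CTL_ADD, efd[i + 1], &ev) < 0) {
			perror("EPOLL_CTL_ADD");
			return 1;
		}
	}
	return 0;
}
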
Fixing the wakeup paths could be done at wakeup time in a similar manner
by keeping track of nodes that have already been visited, but the
complexity is harder, since there can be multiple wakeups on different
CPUs. Thus, I've opted to limit the number of possible wakeup paths when
the paths are created.

This is accomplished by noting that the end file descriptors found
during the loop detection pass (from the newly added link) are actually
the sources for wakeup events. I keep a list of these file descriptors
and limit the number and length of the paths that emanate from these
'source file descriptors'. In the current implementation I allow 1000
paths of length 1, 500 of length 2, 100 of length 3, 50 of length 4 and
10 of length 5. Note that it is sufficient to check only the 'source
file descriptors' reachable from the newly added link, since no other
'source file descriptors' will have newly added links. This lets us
check only the wakeup paths that may have gotten too long, rather than
re-checking all possible wakeup paths on the system.

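As a hypothetical illustration of these limits (this program is not from
the patch), attaching the same pipe to more than 1000 standalone epoll
instances would give the pipe a 1001st path of length 1, so the last
EPOLL_CTL_ADD should fail with EINVAL once this patch is applied:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int pipefd[2], i;
	struct epoll_event ev = { .events = EPOLLIN };

	if (pipe(pipefd) < 0)
		return 1;
	ev.data.fd = pipefd[0];
	/* 1001 standalone epoll sets, all watching one source fd */
	for (i = 0; i < 1001; i++) {
		int efd = epoll_create1(0);
		if (efd < 0)	/* may need a raised RLIMIT_NOFILE */
			break;
		if (epoll_ctl(efd, EPOLL_CTL_ADD, pipefd[0], &ev) < 0) {
			printf("add #%d failed: %s\n", i + 1, strerror(errno));
			break;
		}
	}
	return 0;
}

On an unpatched kernel every add succeeds; that uncontrolled fan-in is
exactly what the limits are meant to cap.
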
In terms of the path limit selection, it is first worth noting that the
most common case for epoll is probably the model where you have one
epoll file descriptor that is monitoring n 'source file descriptors'.
In this case, each 'source file descriptor' has 1 path of length 1.
Thus, I believe that the limits I'm proposing are quite reasonable and
in fact may be too generous. I'm therefore hoping that the proposed
limits will not cause any workloads that currently work to start
failing.

In terms of locking, I have extended the use of the 'epmutex' to all
epoll_ctl add and remove operations. Currently it is only used in a
subset of the add paths. I need to hold the epmutex so that we can
traverse a coherent graph when checking the number of paths. I believe
that this additional locking is probably ok, since it is in the
setup/teardown paths and doesn't affect the running paths, but it
certainly is going to add some extra overhead. Also worth noting is
that the epmutex was recently added to the epoll_ctl add operations in
the initial path loop detection code, using the argument that it was
not on a critical path.

Another thing to note here is the length of epoll chains that is
allowed. Currently, eventpoll.c defines:

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

This basically means that I am limited to a graph depth of 5
(EP_MAX_NESTS + 1). However, this limit is currently only enforced
during the loop check detection code, and only when the epoll file
descriptors are added in a certain order. Thus, this limit is currently
easily bypassed. The newly added check for wakeup paths strictly limits
the wakeup paths to a length of 5, regardless of the order in which eps
are linked together. Thus, a side effect of the new code is a more
consistent enforcement of the graph depth.

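A hypothetical sketch of that stricter enforcement (again, not from the
patch): a 6-deep chain of epoll fds can still be built when each fd is
added to its parent while it is empty, slipping past the old downward
loop check, but attaching a real event source at the bottom would create
a wakeup path of length 6 and should now fail with EINVAL:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int efd[6], pipefd[2], i;
	struct epoll_event ev = { .events = EPOLLIN };

	for (i = 0; i < 6; i++)
		efd[i] = epoll_create1(0);
	/* Adding each fd into its parent while still empty keeps the
	 * downward loop check shallow at every step. */
	for (i = 1; i < 6; i++) {
		ev.data.fd = efd[i];
		epoll_ctl(efd[i - 1], EPOLL_CTL_ADD, efd[i], &ev);
	}
	if (pipe(pipefd) == 0) {
		ev.data.fd = pipefd[0];
		if (epoll_ctl(efd[5], EPOLL_CTL_ADD, pipefd[0], &ev) < 0)
			printf("source add failed: %s\n", strerror(errno));
	}
	return 0;
}
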
Thus far, I've tested this using the sample programs previously
mentioned, which now either return quickly or return -EINVAL. I've also
tested using the piptest.c epoll tester, which showed no difference in
performance. I've also created a number of different epoll networks and
tested that they behave as expected.

I believe this solves the original diabolical test cases while still
preserving sane epoll nesting.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Cc: Nelson Elhage <nelhage@ksplice.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/eventpoll.c            |  234 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/eventpoll.h |    1 +
 include/linux/fs.h        |    1 +
 3 files changed, 211 insertions(+), 25 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 12a772b..ea54cde 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
 
 	/* The user that created the eventpoll descriptor */
 	struct user_struct *user;
+
+	struct file *file;
+
+	/* used to optimize loop detection check */
+	int visited;
+	struct list_head visited_list_link;
 };
 
 /* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+static LIST_HEAD(visited_list);
+
+/*
+ * List of files with newly added links, where we may need to limit the number
+ * of emanating paths. Protected by the epmutex.
+ */
+static LIST_HEAD(tfile_check_list);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
+static const struct file_operations eventpoll_fops;
+
+static inline int is_file_epoll(struct file *f)
+{
+	return f->f_op == &eventpoll_fops;
+}
 
 /* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -728,12 +749,6 @@ static const struct file_operations eventpoll_fops = {
 	.llseek		= noop_llseek,
 };
 
-/* Fast test to see if the file is an eventpoll file */
-static inline int is_file_epoll(struct file *f)
-{
-	return f->f_op == &eventpoll_fops;
-}
-
 /*
  * This is called from eventpoll_release() to unlink files from the eventpoll
  * interface. We need to have this facility to cleanup correctly files that are
@@ -954,6 +969,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
+#define PATH_ARR_SIZE 5
+/*
+ * These are the number paths of length 1 to 5, that we are allowing to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1, to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient. These
+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
+ * and delete can't add additional paths. Protected by the epmutex.
+ */
+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+static int path_count[PATH_ARR_SIZE];
+
+static int path_count_inc(int nests)
+{
+	if (++path_count[nests] > path_limits[nests])
+		return -1;
+	return 0;
+}
+
+static void path_count_init(void)
+{
+	int i;
+
+	for (i = 0; i < PATH_ARR_SIZE; i++)
+		path_count[i] = 0;
+}
+
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct file *child_file;
+	struct epitem *epi;
+
+	list_for_each_entry(epi, &file->f_ep_links, fllink) {
+		child_file = epi->ep->file;
+		if (is_file_epoll(child_file)) {
+			if (list_empty(&child_file->f_ep_links)) {
+				if (path_count_inc(call_nests)) {
+					error = -1;
+					break;
+				}
+			} else {
+				error = ep_call_nested(&poll_loop_ncalls,
+							EP_MAX_NESTS,
+							reverse_path_check_proc,
+							child_file, child_file,
+							current);
+			}
+			if (error != 0)
+				break;
+		} else {
+			printk(KERN_ERR "reverse_path_check_proc: "
+				"file is not an ep!\n");
+		}
+	}
+	return error;
+}
+
+/**
+ * reverse_path_check - The tfile_check_list is list of file *, which have
+ *                      links that are proposed to be newly added. We need to
+ *                      make sure that those added links don't add too many
+ *                      paths such that we will spend all our time waking up
+ *                      eventpoll objects.
+ *
+ * Returns: Returns zero if the proposed links don't create too many paths,
+ *          -1 otherwise.
+ */
+static int reverse_path_check(void)
+{
+	int length = 0;
+	int error = 0;
+	struct file *current_file;
+
+	/* let's call this for all tfiles */
+	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+		length++;
+		path_count_init();
+		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					reverse_path_check_proc, current_file,
+					current_file, current);
+		if (error)
+			break;
+	}
+	return error;
+}
+
 /*
  * Must be called with "mtx" held.
  */
@@ -1015,6 +1123,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 */
 	ep_rbtree_insert(ep, epi);
 
+	/* now check if we've created too many backpaths */
+	error = -EINVAL;
+	if (reverse_path_check())
+		goto error_remove_epi;
+
 	/* We have to drop the new item inside our item list to keep track of it */
 	spin_lock_irqsave(&ep->lock, flags);
 
@@ -1039,6 +1152,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	return 0;
 
+error_remove_epi:
+	spin_lock(&tfile->f_lock);
+	if (ep_is_linked(&epi->fllink))
+		list_del_init(&epi->fllink);
+	spin_unlock(&tfile->f_lock);
+
+	rb_erase(&epi->rbn, &ep->rbr);
+
 error_unregister:
 	ep_unregister_pollwait(ep, epi);
 
@@ -1303,18 +1424,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 	int error = 0;
 	struct file *file = priv;
 	struct eventpoll *ep = file->private_data;
+	struct eventpoll *ep_tovisit;
 	struct rb_node *rbp;
 	struct epitem *epi;
 
 	mutex_lock_nested(&ep->mtx, call_nests + 1);
+	ep->visited = 1;
+	list_add(&ep->visited_list_link, &visited_list);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			ep_tovisit = epi->ffd.file->private_data;
+			if (ep_tovisit->visited)
+				continue;
 			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
-					ep_loop_check_proc, epi->ffd.file,
-					epi->ffd.file->private_data, current);
+					ep_loop_check_proc, epi->ffd.file,
+					ep_tovisit, current);
 			if (error != 0)
 				break;
+		} else {
+			/*
+			 * If we've reached a file that is not associated with
+			 * an ep, then we need to check if the newly added
+			 * links are going to add too many wakeup paths. We do
+			 * this by adding it to the tfile_check_list, if it's
+			 * not already there, and calling reverse_path_check()
+			 * during ep_insert().
+			 */
+			if (list_empty(&epi->ffd.file->f_tfile_llink))
+				list_add(&epi->ffd.file->f_tfile_llink,
+					 &tfile_check_list);
 		}
 	}
 	mutex_unlock(&ep->mtx);
@@ -1335,8 +1474,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
  */
 static int ep_loop_check(struct eventpoll *ep, struct file *file)
 {
-	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+	int ret;
+	struct eventpoll *ep_cur, *ep_next;
+
+	ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
 			      ep_loop_check_proc, file, ep, current);
+	/* clear visited list */
+	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
+							visited_list_link) {
+		ep_cur->visited = 0;
+		list_del(&ep_cur->visited_list_link);
+	}
+	return ret;
+}
+
+static void clear_tfile_check_list(void)
+{
+	struct file *file;
+
+	/* first clear the tfile_check_list */
+	while (!list_empty(&tfile_check_list)) {
+		file = list_first_entry(&tfile_check_list, struct file,
+					f_tfile_llink);
+		list_del_init(&file->f_tfile_llink);
+	}
+	INIT_LIST_HEAD(&tfile_check_list);
 }
 
 /*
@@ -1344,8 +1506,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-	int error;
+	int error, fd;
 	struct eventpoll *ep = NULL;
+	struct file *file;
 
 	/* Check the EPOLL_* constant for consistency. */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1362,11 +1525,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+	if (fd < 0) {
+		error = fd;
+		goto out_free_ep;
+	}
+	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
 				 O_RDWR | (flags & O_CLOEXEC));
-	if (error < 0)
-		ep_free(ep);
-
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto out_free_fd;
+	}
+	fd_install(fd, file);
+	ep->file = file;
+	return fd;
+
+out_free_fd:
+	put_unused_fd(fd);
+out_free_ep:
+	ep_free(ep);
 	return error;
 }
 
@@ -1432,21 +1609,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	/*
 	 * When we insert an epoll file descriptor, inside another epoll file
 	 * descriptor, there is the change of creating closed loops, which are
-	 * better be handled here, than in more critical paths.
+	 * better be handled here, than in more critical paths. While we are
+	 * checking for loops we also determine the list of files reachable
+	 * and hang them on the tfile_check_list, so we can check that we
+	 * haven't created too many possible wakeup paths.
 	 *
-	 * We hold epmutex across the loop check and the insert in this case, in
-	 * order to prevent two separate inserts from racing and each doing the
-	 * insert "at the same time" such that ep_loop_check passes on both
-	 * before either one does the insert, thereby creating a cycle.
+	 * We need to hold the epmutex across both ep_insert and ep_remove
+	 * b/c we want to make sure we are looking at a coherent view of
+	 * epoll network.
 	 */
-	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
 		mutex_lock(&epmutex);
 		did_lock_epmutex = 1;
-		error = -ELOOP;
-		if (ep_loop_check(ep, tfile) != 0)
-			goto error_tgt_fput;
 	}
-
+	if (op == EPOLL_CTL_ADD) {
+		if (is_file_epoll(tfile)) {
+			error = -ELOOP;
+			if (ep_loop_check(ep, tfile) != 0)
+				goto error_tgt_fput;
+		} else
+			list_add(&tfile->f_tfile_llink, &tfile_check_list);
+	}
 
 	mutex_lock_nested(&ep->mtx, 0);
 
@@ -1465,6 +1648,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			error = ep_insert(ep, &epds, tfile, fd);
 		} else
 			error = -EEXIST;
+		clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
 		if (epi)
@@ -1483,7 +1667,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-	if (unlikely(did_lock_epmutex))
+	if (did_lock_epmutex)
 		mutex_unlock(&epmutex);
 
 	fput(tfile);
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f362733..657ab55 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -61,6 +61,7 @@ struct file;
 static inline void eventpoll_init_file(struct file *file)
 {
 	INIT_LIST_HEAD(&file->f_ep_links);
+	INIT_LIST_HEAD(&file->f_tfile_llink);
 }
 
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0bc4ff..10b2288 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1001,6 +1001,7 @@ struct file {
 #ifdef CONFIG_EPOLL
 	/* Used by fs/eventpoll.c to link all the hooks to this file */
 	struct list_head	f_ep_links;
+	struct list_head	f_tfile_llink;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
 #ifdef CONFIG_DEBUG_WRITECOUNT
--
1.7.9.4
