· 7 years ago · Feb 23, 2019, 02:24 AM
1diff -ruNb a//Documentation/filesystems/Locking b//Documentation/filesystems/Locking
2--- a//Documentation/filesystems/Locking 2012-10-12 21:48:25.000000000 +0100
3+++ b//Documentation/filesystems/Locking 2012-10-21 15:32:26.594986267 +0100
4@@ -62,6 +62,7 @@
5 int (*removexattr) (struct dentry *, const char *);
6 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
7 void (*update_time)(struct inode *, struct timespec *, int);
8+ struct file *(*open)(struct dentry *,struct file *,const struct cred *);
9
10 locking rules:
11 all may block
12@@ -89,7 +90,7 @@
13 removexattr: yes
14 fiemap: no
15 update_time: no
16-
17+open: no
18 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
19 victim.
20 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
21diff -ruNb a//Documentation/filesystems/overlayfs.txt b//Documentation/filesystems/overlayfs.txt
22--- a//Documentation/filesystems/overlayfs.txt 1970-01-01 01:00:00.000000000 +0100
23+++ b//Documentation/filesystems/overlayfs.txt 2012-10-21 15:34:04.765813939 +0100
24@@ -0,0 +1,199 @@
25+Written by: Neil Brown <neilb@suse.de>
26+
27+Overlay Filesystem
28+==================
29+
30+This document describes a prototype for a new approach to providing
31+overlay-filesystem functionality in Linux (sometimes referred to as
32+union-filesystems). An overlay-filesystem tries to present a
33+filesystem which is the result of overlaying one filesystem on top
34+of the other.
35+
36+The result will inevitably fail to look exactly like a normal
37+filesystem for various technical reasons. The expectation is that
38+many use cases will be able to ignore these differences.
39+
40+This approach is 'hybrid' because the objects that appear in the
41+filesystem do not all appear to belong to that filesystem. In many
42+cases an object accessed in the union will be indistinguishable
43+from accessing the corresponding object from the original filesystem.
44+This is most obvious from the 'st_dev' field returned by stat(2).
45+
46+While directories will report an st_dev from the overlay-filesystem,
47+all non-directory objects will report an st_dev from the lower or
48+upper filesystem that is providing the object. Similarly st_ino will
49+only be unique when combined with st_dev, and both of these can change
50+over the lifetime of a non-directory object. Many applications and
51+tools ignore these values and will not be affected.
52+
53+Upper and Lower
54+---------------
55+
56+An overlay filesystem combines two filesystems - an 'upper' filesystem
57+and a 'lower' filesystem. When a name exists in both filesystems, the
58+object in the 'upper' filesystem is visible while the object in the
59+'lower' filesystem is either hidden or, in the case of directories,
60+merged with the 'upper' object.
61+
62+It would be more correct to refer to an upper and lower 'directory
63+tree' rather than 'filesystem' as it is quite possible for both
64+directory trees to be in the same filesystem and there is no
65+requirement that the root of a filesystem be given for either upper or
66+lower.
67+
68+The lower filesystem can be any filesystem supported by Linux and does
69+not need to be writable. The lower filesystem can even be another
70+overlayfs. The upper filesystem will normally be writable and if it
71+is it must support the creation of trusted.* extended attributes, and
72+must provide valid d_type in readdir responses, at least for symbolic
73+links - so NFS is not suitable.
74+
75+A read-only overlay of two read-only filesystems may use any
76+filesystem type.
77+
78+Directories
79+-----------
80+
81+Overlaying mainly involves directories. If a given name appears in both
82+upper and lower filesystems and refers to a non-directory in either,
83+then the lower object is hidden - the name refers only to the upper
84+object.
85+
86+Where both upper and lower objects are directories, a merged directory
87+is formed.
88+
89+At mount time, the two directories given as mount options are combined
90+into a merged directory:
91+
92+ mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay
93+
94+Then whenever a lookup is requested in such a merged directory, the
95+lookup is performed in each actual directory and the combined result
96+is cached in the dentry belonging to the overlay filesystem. If both
97+actual lookups find directories, both are stored and a merged
98+directory is created, otherwise only one is stored: the upper if it
99+exists, else the lower.
100+
101+Only the lists of names from directories are merged. Other content
102+such as metadata and extended attributes are reported for the upper
103+directory only. These attributes of the lower directory are hidden.
104+
105+whiteouts and opaque directories
106+--------------------------------
107+
108+In order to support rm and rmdir without changing the lower
109+filesystem, an overlay filesystem needs to record in the upper filesystem
110+that files have been removed. This is done using whiteouts and opaque
111+directories (non-directories are always opaque).
112+
113+The overlay filesystem uses extended attributes with a
114+"trusted.overlay." prefix to record these details.
115+
116+A whiteout is created as a symbolic link with target
117+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y".
118+When a whiteout is found in the upper level of a merged directory, any
119+matching name in the lower level is ignored, and the whiteout itself
120+is also hidden.
121+
122+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
123+to "y". Where the upper filesystem contains an opaque directory, any
124+directory in the lower filesystem with the same name is ignored.
125+
126+readdir
127+-------
128+
129+When a 'readdir' request is made on a merged directory, the upper and
130+lower directories are each read and the name lists merged in the
131+obvious way (upper is read first, then lower - entries that already
132+exist are not re-added). This merged name list is cached in the
133+'struct file' and so remains as long as the file is kept open. If the
134+directory is opened and read by two processes at the same time, they
135+will each have separate caches. A seekdir to the start of the
136+directory (offset 0) followed by a readdir will cause the cache to be
137+discarded and rebuilt.
138+
139+This means that changes to the merged directory do not appear while a
140+directory is being read. This is unlikely to be noticed by many
141+programs.
142+
143+seek offsets are assigned sequentially when the directories are read.
144+Thus if
145+ - read part of a directory
146+ - remember an offset, and close the directory
147+ - re-open the directory some time later
148+ - seek to the remembered offset
149+
150+there may be little correlation between the old and new locations in
151+the list of filenames, particularly if anything has changed in the
152+directory.
153+
154+Readdir on directories that are not merged is simply handled by the
155+underlying directory (upper or lower).
156+
157+
158+Non-directories
159+---------------
160+
161+Objects that are not directories (files, symlinks, device-special
162+files etc.) are presented either from the upper or lower filesystem as
163+appropriate. When a file in the lower filesystem is accessed in a way
164+that requires write-access, such as opening for write access, changing
165+some metadata etc., the file is first copied from the lower filesystem
166+to the upper filesystem (copy_up). Note that creating a hard-link
167+also requires copy_up, though of course creation of a symlink does
168+not.
169+
170+The copy_up may turn out to be unnecessary, for example if the file is
171+opened for read-write but the data is not modified.
172+
173+The copy_up process first makes sure that the containing directory
174+exists in the upper filesystem - creating it and any parents as
175+necessary. It then creates the object with the same metadata (owner,
176+mode, mtime, symlink-target etc.) and then if the object is a file, the
177+data is copied from the lower to the upper filesystem. Finally any
178+extended attributes are copied up.
179+
180+Once the copy_up is complete, the overlay filesystem simply
181+provides direct access to the newly created file in the upper
182+filesystem - future operations on the file are barely noticed by the
183+overlay filesystem (though an operation on the name of the file such as
184+rename or unlink will of course be noticed and handled).
185+
186+
187+Non-standard behavior
188+---------------------
189+
190+The copy_up operation essentially creates a new, identical file and
191+moves it over to the old name. The new file may be on a different
192+filesystem, so both st_dev and st_ino of the file may change.
193+
194+Any open files referring to this inode will access the old data and
195+metadata. Similarly any file locks obtained before copy_up will not
196+apply to the copied up file.
197+
198+On a file opened with O_RDONLY, fchmod(2), fchown(2), futimesat(2)
199+and fsetxattr(2) will fail with EROFS.
200+
201+If a file with multiple hard links is copied up, then this will
202+"break" the link. Changes will not be propagated to other names
203+referring to the same inode.
204+
205+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
206+object in overlayfs will not contain valid absolute paths, only
207+relative paths leading up to the filesystem's root. This will be
208+fixed in the future.
209+
210+Some operations are not atomic, for example a crash during copy_up or
211+rename will leave the filesystem in an inconsistent state. This will
212+be addressed in the future.
213+
214+Changes to underlying filesystems
215+---------------------------------
216+
217+Offline changes, when the overlay is not mounted, are allowed to either
218+the upper or the lower trees.
219+
220+Changes to the underlying filesystems while part of a mounted overlay
221+filesystem are not allowed. If the underlying filesystem is changed,
222+the behavior of the overlay is undefined, though it will not result in
223+a crash or deadlock.
224diff -ruNb a//Documentation/filesystems/vfs.txt b//Documentation/filesystems/vfs.txt
225--- a//Documentation/filesystems/vfs.txt 2012-10-12 21:48:25.000000000 +0100
226+++ b//Documentation/filesystems/vfs.txt 2012-10-21 15:32:26.595986134 +0100
227@@ -364,6 +364,8 @@
228 ssize_t (*listxattr) (struct dentry *, char *, size_t);
229 int (*removexattr) (struct dentry *, const char *);
230 void (*update_time)(struct inode *, struct timespec *, int);
231+ struct file *(*open) (struct dentry *, struct file *,
232+ const struct cred *);
233 };
234
235 Again, all methods are called without any locks being held, unless
236@@ -476,6 +478,12 @@
237 an inode. If this is not defined the VFS will update the inode itself
238 and call mark_inode_dirty_sync.
239
240+ open: this is an alternative to f_op->open(), the difference is that this
241+ method may return any open file, not necessarily originating from the
242+ same filesystem as the one i_op->open() was called on. It may be useful
243+ for stacking filesystems which want to allow native I/O directly on
244+ underlying files.
245+
246 The Address Space Object
247 ========================
248
249diff -ruNb a//fs/ecryptfs/main.c b//fs/ecryptfs/main.c
250--- a//fs/ecryptfs/main.c 2012-10-12 21:48:25.000000000 +0100
251+++ b//fs/ecryptfs/main.c 2012-10-21 15:34:17.524102063 +0100
252@@ -544,6 +544,13 @@
253 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
254 s->s_blocksize = path.dentry->d_sb->s_blocksize;
255 s->s_magic = ECRYPTFS_SUPER_MAGIC;
256+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
257+
258+ rc = -EINVAL;
259+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
260+ printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n");
261+ goto out_free;
262+ }
263
264 inode = ecryptfs_get_inode(path.dentry->d_inode, s);
265 rc = PTR_ERR(inode);
266diff -ruNb a//fs/Kconfig b//fs/Kconfig
267--- a//fs/Kconfig 2012-10-12 21:48:25.000000000 +0100
268+++ b//fs/Kconfig 2012-10-21 15:33:23.868301470 +0100
269@@ -67,6 +67,7 @@
270
271 source "fs/autofs4/Kconfig"
272 source "fs/fuse/Kconfig"
273+source "fs/overlayfs/Kconfig"
274
275 config CUSE
276 tristate "Character device in Userspace support"
277diff -ruNb a//fs/Makefile b//fs/Makefile
278--- a//fs/Makefile 2012-10-12 21:48:25.000000000 +0100
279+++ b//fs/Makefile 2012-10-21 15:33:23.868301470 +0100
280@@ -106,6 +106,7 @@
281 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
282 obj-$(CONFIG_ADFS_FS) += adfs/
283 obj-$(CONFIG_FUSE_FS) += fuse/
284+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
285 obj-$(CONFIG_UDF_FS) += udf/
286 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
287 obj-$(CONFIG_OMFS_FS) += omfs/
288diff -ruNb a//fs/namei.c b//fs/namei.c
289--- a//fs/namei.c 2012-10-12 21:48:25.000000000 +0100
290+++ b//fs/namei.c 2012-10-21 15:35:00.151382436 +0100
291@@ -315,6 +315,36 @@
292 }
293
294 /**
295+ * inode_only_permission - check access rights to a given inode only
296+ * @inode: inode to check permissions on
297+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
298+ *
299+ * Uses to check read/write/execute permissions on an inode directly, we do
300+ * not check filesystem permissions.
301+ */
302+int inode_only_permission(struct inode *inode, int mask)
303+{
304+ int retval;
305+
306+ /*
307+ * Nobody gets write access to an immutable file.
308+ */
309+ if (unlikely(mask & MAY_WRITE) && IS_IMMUTABLE(inode))
310+ return -EACCES;
311+
312+ retval = do_inode_permission(inode, mask);
313+ if (retval)
314+ return retval;
315+
316+ retval = devcgroup_inode_permission(inode, mask);
317+ if (retval)
318+ return retval;
319+
320+ return security_inode_permission(inode, mask);
321+}
322+EXPORT_SYMBOL(inode_only_permission);
323+
324+/**
325 * inode_permission - check for access rights to a given inode
326 * @inode: inode to check permission on
327 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
328@@ -328,8 +358,6 @@
329 */
330 int inode_permission(struct inode *inode, int mask)
331 {
332- int retval;
333-
334 if (unlikely(mask & MAY_WRITE)) {
335 umode_t mode = inode->i_mode;
336
337@@ -339,23 +367,9 @@
338 if (IS_RDONLY(inode) &&
339 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
340 return -EROFS;
341-
342- /*
343- * Nobody gets write access to an immutable file.
344- */
345- if (IS_IMMUTABLE(inode))
346- return -EACCES;
347 }
348
349- retval = do_inode_permission(inode, mask);
350- if (retval)
351- return retval;
352-
353- retval = devcgroup_inode_permission(inode, mask);
354- if (retval)
355- return retval;
356-
357- return security_inode_permission(inode, mask);
358+ return inode_only_permission(inode, mask);
359 }
360
361 /**
362diff -ruNb a//fs/namespace.c b//fs/namespace.c
363--- a//fs/namespace.c 2012-10-12 21:48:25.000000000 +0100
364+++ b//fs/namespace.c 2012-10-21 15:33:09.262261274 +0100
365@@ -1327,6 +1327,24 @@
366 release_mounts(&umount_list);
367 }
368
369+struct vfsmount *clone_private_mount(struct path *path)
370+{
371+ struct mount *old_mnt = real_mount(path->mnt);
372+ struct mount *new_mnt;
373+
374+ if (IS_MNT_UNBINDABLE(old_mnt))
375+ return ERR_PTR(-EINVAL);
376+
377+ down_read(&namespace_sem);
378+ new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
379+ up_read(&namespace_sem);
380+ if (!new_mnt)
381+ return ERR_PTR(-ENOMEM);
382+
383+ return &new_mnt->mnt;
384+}
385+EXPORT_SYMBOL_GPL(clone_private_mount);
386+
387 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
388 struct vfsmount *root)
389 {
390diff -ruNb a//fs/open.c b//fs/open.c
391--- a//fs/open.c 2012-10-12 21:48:25.000000000 +0100
392+++ b//fs/open.c 2012-10-21 15:32:26.596986001 +0100
393@@ -667,8 +667,7 @@
394 return 0;
395 }
396
397-static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt,
398- struct file *f,
399+static struct file *do_dentry_open(struct path *path, struct file *f,
400 int (*open)(struct inode *, struct file *),
401 const struct cred *cred)
402 {
403@@ -676,15 +675,16 @@
404 struct inode *inode;
405 int error;
406
407+ path_get(path);
408 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
409 FMODE_PREAD | FMODE_PWRITE;
410
411 if (unlikely(f->f_flags & O_PATH))
412 f->f_mode = FMODE_PATH;
413
414- inode = dentry->d_inode;
415+ inode = path->dentry->d_inode;
416 if (f->f_mode & FMODE_WRITE) {
417- error = __get_file_write_access(inode, mnt);
418+ error = __get_file_write_access(inode, path->mnt);
419 if (error)
420 goto cleanup_file;
421 if (!special_file(inode->i_mode))
422@@ -692,8 +692,7 @@
423 }
424
425 f->f_mapping = inode->i_mapping;
426- f->f_path.dentry = dentry;
427- f->f_path.mnt = mnt;
428+ f->f_path = *path;
429 f->f_pos = 0;
430 file_sb_list_add(f, inode->i_sb);
431
432@@ -740,24 +739,22 @@
433 * here, so just reset the state.
434 */
435 file_reset_write(f);
436- mnt_drop_write(mnt);
437+ mnt_drop_write(path->mnt);
438 }
439 }
440 file_sb_list_del(f);
441 f->f_path.dentry = NULL;
442 f->f_path.mnt = NULL;
443 cleanup_file:
444- dput(dentry);
445- mntput(mnt);
446+ path_put(path);
447 return ERR_PTR(error);
448 }
449
450-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
451- struct file *f,
452+static struct file *__dentry_open(struct path *path, struct file *f,
453 int (*open)(struct inode *, struct file *),
454 const struct cred *cred)
455 {
456- struct file *res = do_dentry_open(dentry, mnt, f, open, cred);
457+ struct file *res = do_dentry_open(path, f, open, cred);
458 if (!IS_ERR(res)) {
459 int error = open_check_o_direct(f);
460 if (error) {
461@@ -792,14 +789,14 @@
462 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
463 int (*open)(struct inode *, struct file *))
464 {
465+ struct path path = { .dentry = dentry, .mnt = nd->path.mnt };
466 const struct cred *cred = current_cred();
467
468 if (IS_ERR(nd->intent.open.file))
469 goto out;
470 if (IS_ERR(dentry))
471 goto out_err;
472- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
473- nd->intent.open.file,
474+ nd->intent.open.file = __dentry_open(&path, nd->intent.open.file,
475 open, cred);
476 out:
477 return nd->intent.open.file;
478@@ -831,9 +828,7 @@
479 } else {
480 struct file *res;
481
482- path_get(&nd->path);
483- res = do_dentry_open(nd->path.dentry, nd->path.mnt,
484- filp, NULL, cred);
485+ res = vfs_open(&nd->path, filp, cred);
486 if (!IS_ERR(res)) {
487 int error;
488
489@@ -860,27 +855,48 @@
490 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
491 const struct cred *cred)
492 {
493- int error;
494 struct file *f;
495+ struct file *ret;
496+ struct path path = { .dentry = dentry, .mnt = mnt };
497
498 validate_creds(cred);
499
500 /* We must always pass in a valid mount pointer. */
501 BUG_ON(!mnt);
502
503- error = -ENFILE;
504+ ret = ERR_PTR(-ENFILE);
505 f = get_empty_filp();
506- if (f == NULL) {
507- dput(dentry);
508- mntput(mnt);
509- return ERR_PTR(error);
510+ if (f != NULL) {
511+ f->f_flags = flags;
512+ ret = vfs_open(&path, f, cred);
513 }
514+ path_put(&path);
515
516- f->f_flags = flags;
517- return __dentry_open(dentry, mnt, f, NULL, cred);
518+ return ret;
519 }
520 EXPORT_SYMBOL(dentry_open);
521
522+/**
523+ * vfs_open - open the file at the given path
524+ * @path: path to open
525+ * @filp: newly allocated file with f_flags initialized
526+ * @cred: credentials to use
527+ *
528+ * Open the file. If successful, the returned file will have acquired
529+ * an additional reference for path.
530+ */
531+struct file *vfs_open(struct path *path, struct file *filp,
532+ const struct cred *cred)
533+{
534+ struct inode *inode = path->dentry->d_inode;
535+
536+ if (inode->i_op->open)
537+ return inode->i_op->open(path->dentry, filp, cred);
538+ else
539+ return __dentry_open(path, filp, NULL, cred);
540+}
541+EXPORT_SYMBOL(vfs_open);
542+
543 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
544 {
545 struct fdtable *fdt = files_fdtable(files);
546diff -ruNb a//fs/overlayfs/copy_up.c b//fs/overlayfs/copy_up.c
547--- a//fs/overlayfs/copy_up.c 1970-01-01 01:00:00.000000000 +0100
548+++ b//fs/overlayfs/copy_up.c 2012-10-21 15:33:23.868301470 +0100
549@@ -0,0 +1,385 @@
550+/*
551+ *
552+ * Copyright (C) 2011 Novell Inc.
553+ *
554+ * This program is free software; you can redistribute it and/or modify it
555+ * under the terms of the GNU General Public License version 2 as published by
556+ * the Free Software Foundation.
557+ */
558+
559+#include <linux/fs.h>
560+#include <linux/slab.h>
561+#include <linux/file.h>
562+#include <linux/splice.h>
563+#include <linux/xattr.h>
564+#include <linux/security.h>
565+#include <linux/uaccess.h>
566+#include <linux/sched.h>
567+#include "overlayfs.h"
568+
569+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
570+
571+static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)
572+{
573+ ssize_t list_size, size;
574+ char *buf, *name, *value;
575+ int error;
576+
577+ if (!old->d_inode->i_op->getxattr ||
578+ !new->d_inode->i_op->getxattr)
579+ return 0;
580+
581+ list_size = vfs_listxattr(old, NULL, 0);
582+ if (list_size <= 0) {
583+ if (list_size == -EOPNOTSUPP)
584+ return 0;
585+ return list_size;
586+ }
587+
588+ buf = kzalloc(list_size, GFP_KERNEL);
589+ if (!buf)
590+ return -ENOMEM;
591+
592+ error = -ENOMEM;
593+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
594+ if (!value)
595+ goto out;
596+
597+ list_size = vfs_listxattr(old, buf, list_size);
598+ if (list_size <= 0) {
599+ error = list_size;
600+ goto out_free_value;
601+ }
602+
603+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
604+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
605+ if (size <= 0) {
606+ error = size;
607+ goto out_free_value;
608+ }
609+ error = vfs_setxattr(new, name, value, size, 0);
610+ if (error)
611+ goto out_free_value;
612+ }
613+
614+out_free_value:
615+ kfree(value);
616+out:
617+ kfree(buf);
618+ return error;
619+}
620+
621+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
622+{
623+ struct file *old_file;
624+ struct file *new_file;
625+ int error = 0;
626+
627+ if (len == 0)
628+ return 0;
629+
630+ old_file = ovl_path_open(old, O_RDONLY);
631+ if (IS_ERR(old_file))
632+ return PTR_ERR(old_file);
633+
634+ new_file = ovl_path_open(new, O_WRONLY);
635+ if (IS_ERR(new_file)) {
636+ error = PTR_ERR(new_file);
637+ goto out_fput;
638+ }
639+
640+ /* FIXME: copy up sparse files efficiently */
641+ while (len) {
642+ loff_t offset = new_file->f_pos;
643+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
644+ long bytes;
645+
646+ if (len < this_len)
647+ this_len = len;
648+
649+ if (signal_pending_state(TASK_KILLABLE, current)) {
650+ error = -EINTR;
651+ break;
652+ }
653+
654+ bytes = do_splice_direct(old_file, &offset, new_file, this_len,
655+ SPLICE_F_MOVE);
656+ if (bytes <= 0) {
657+ error = bytes;
658+ break;
659+ }
660+
661+ len -= bytes;
662+ }
663+
664+ fput(new_file);
665+out_fput:
666+ fput(old_file);
667+ return error;
668+}
669+
670+static char *ovl_read_symlink(struct dentry *realdentry)
671+{
672+ int res;
673+ char *buf;
674+ struct inode *inode = realdentry->d_inode;
675+ mm_segment_t old_fs;
676+
677+ res = -EINVAL;
678+ if (!inode->i_op->readlink)
679+ goto err;
680+
681+ res = -ENOMEM;
682+ buf = (char *) __get_free_page(GFP_KERNEL);
683+ if (!buf)
684+ goto err;
685+
686+ old_fs = get_fs();
687+ set_fs(get_ds());
688+ /* The cast to a user pointer is valid due to the set_fs() */
689+ res = inode->i_op->readlink(realdentry,
690+ (char __user *)buf, PAGE_SIZE - 1);
691+ set_fs(old_fs);
692+ if (res < 0) {
693+ free_page((unsigned long) buf);
694+ goto err;
695+ }
696+ buf[res] = '\0';
697+
698+ return buf;
699+
700+err:
701+ return ERR_PTR(res);
702+}
703+
704+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
705+{
706+ struct iattr attr = {
707+ .ia_valid =
708+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
709+ .ia_atime = stat->atime,
710+ .ia_mtime = stat->mtime,
711+ };
712+
713+ return notify_change(upperdentry, &attr);
714+}
715+
716+static int ovl_set_mode(struct dentry *upperdentry, umode_t mode)
717+{
718+ struct iattr attr = {
719+ .ia_valid = ATTR_MODE,
720+ .ia_mode = mode,
721+ };
722+
723+ return notify_change(upperdentry, &attr);
724+}
725+
726+static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,
727+ struct path *lowerpath, struct kstat *stat,
728+ const char *link)
729+{
730+ int err;
731+ struct path newpath;
732+ umode_t mode = stat->mode;
733+
734+ /* Can't properly set mode on creation because of the umask */
735+ stat->mode &= S_IFMT;
736+
737+ ovl_path_upper(dentry, &newpath);
738+ WARN_ON(newpath.dentry);
739+ newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link);
740+ if (IS_ERR(newpath.dentry))
741+ return PTR_ERR(newpath.dentry);
742+
743+ if (S_ISREG(stat->mode)) {
744+ err = ovl_copy_up_data(lowerpath, &newpath, stat->size);
745+ if (err)
746+ goto err_remove;
747+ }
748+
749+ err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry);
750+ if (err)
751+ goto err_remove;
752+
753+ mutex_lock(&newpath.dentry->d_inode->i_mutex);
754+ if (!S_ISLNK(stat->mode))
755+ err = ovl_set_mode(newpath.dentry, mode);
756+ if (!err)
757+ err = ovl_set_timestamps(newpath.dentry, stat);
758+ mutex_unlock(&newpath.dentry->d_inode->i_mutex);
759+ if (err)
760+ goto err_remove;
761+
762+ ovl_dentry_update(dentry, newpath.dentry);
763+
764+ /*
765+ * Easiest way to get rid of the lower dentry reference is to
766+ * drop this dentry. This is neither needed nor possible for
767+ * directories.
768+ */
769+ if (!S_ISDIR(stat->mode))
770+ d_drop(dentry);
771+
772+ return 0;
773+
774+err_remove:
775+ if (S_ISDIR(stat->mode))
776+ vfs_rmdir(upperdir->d_inode, newpath.dentry);
777+ else
778+ vfs_unlink(upperdir->d_inode, newpath.dentry);
779+
780+ dput(newpath.dentry);
781+
782+ return err;
783+}
784+
785+/*
786+ * Copy up a single dentry
787+ *
788+ * Directory renames only allowed on "pure upper" (already created on
789+ * upper filesystem, never copied up). Directories which are on lower or
790+ * are merged may not be renamed. For these -EXDEV is returned and
791+ * userspace has to deal with it. This means, when copying up a
792+ * directory we can rely on it and ancestors being stable.
793+ *
794+ * Non-directory renames start with copy up of source if necessary. The
795+ * actual rename will only proceed once the copy up was successful. Copy
796+ * up uses upper parent i_mutex for exclusion. Since rename can change
797+ * d_parent it is possible that the copy up will lock the old parent. At
798+ * that point the file will have already been copied up anyway.
799+ */
800+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
801+ struct path *lowerpath, struct kstat *stat)
802+{
803+ int err;
804+ struct kstat pstat;
805+ struct path parentpath;
806+ struct dentry *upperdir;
807+ const struct cred *old_cred;
808+ struct cred *override_cred;
809+ char *link = NULL;
810+
811+ ovl_path_upper(parent, &parentpath);
812+ upperdir = parentpath.dentry;
813+
814+ err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat);
815+ if (err)
816+ return err;
817+
818+ if (S_ISLNK(stat->mode)) {
819+ link = ovl_read_symlink(lowerpath->dentry);
820+ if (IS_ERR(link))
821+ return PTR_ERR(link);
822+ }
823+
824+ err = -ENOMEM;
825+ override_cred = prepare_creds();
826+ if (!override_cred)
827+ goto out_free_link;
828+
829+ override_cred->fsuid = stat->uid;
830+ override_cred->fsgid = stat->gid;
831+ /*
832+ * CAP_SYS_ADMIN for copying up extended attributes
833+ * CAP_DAC_OVERRIDE for create
834+ * CAP_FOWNER for chmod, timestamp update
835+ * CAP_FSETID for chmod
836+ * CAP_MKNOD for mknod
837+ */
838+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
839+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
840+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
841+ cap_raise(override_cred->cap_effective, CAP_FSETID);
842+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
843+ old_cred = override_creds(override_cred);
844+
845+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
846+ if (ovl_path_type(dentry) != OVL_PATH_LOWER) {
847+ err = 0;
848+ } else {
849+ err = ovl_copy_up_locked(upperdir, dentry, lowerpath,
850+ stat, link);
851+ if (!err) {
852+ /* Restore timestamps on parent (best effort) */
853+ ovl_set_timestamps(upperdir, &pstat);
854+ }
855+ }
856+
857+ mutex_unlock(&upperdir->d_inode->i_mutex);
858+
859+ revert_creds(old_cred);
860+ put_cred(override_cred);
861+
862+out_free_link:
863+ if (link)
864+ free_page((unsigned long) link);
865+
866+ return err;
867+}
868+
869+int ovl_copy_up(struct dentry *dentry)
870+{
871+ int err;
872+
873+ err = 0;
874+ while (!err) {
875+ struct dentry *next;
876+ struct dentry *parent;
877+ struct path lowerpath;
878+ struct kstat stat;
879+ enum ovl_path_type type = ovl_path_type(dentry);
880+
881+ if (type != OVL_PATH_LOWER)
882+ break;
883+
884+ next = dget(dentry);
885+ /* find the topmost dentry not yet copied up */
886+ for (;;) {
887+ parent = dget_parent(next);
888+
889+ type = ovl_path_type(parent);
890+ if (type != OVL_PATH_LOWER)
891+ break;
892+
893+ dput(next);
894+ next = parent;
895+ }
896+
897+ ovl_path_lower(next, &lowerpath);
898+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
899+ if (!err)
900+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
901+
902+ dput(parent);
903+ dput(next);
904+ }
905+
906+ return err;
907+}
908+
909+/* Optimize by not copying up the file first and truncating later */
910+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)
911+{
912+ int err;
913+ struct kstat stat;
914+ struct path lowerpath;
915+ struct dentry *parent = dget_parent(dentry);
916+
917+ err = ovl_copy_up(parent);
918+ if (err)
919+ goto out_dput_parent;
920+
921+ ovl_path_lower(dentry, &lowerpath);
922+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
923+ if (err)
924+ goto out_dput_parent;
925+
926+ if (size < stat.size)
927+ stat.size = size;
928+
929+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
930+
931+out_dput_parent:
932+ dput(parent);
933+ return err;
934+}
935diff -ruNb a//fs/overlayfs/dir.c b//fs/overlayfs/dir.c
936--- a//fs/overlayfs/dir.c 1970-01-01 01:00:00.000000000 +0100
937+++ b//fs/overlayfs/dir.c 2012-10-21 15:35:40.472972180 +0100
938@@ -0,0 +1,604 @@
939+/*
940+ *
941+ * Copyright (C) 2011 Novell Inc.
942+ *
943+ * This program is free software; you can redistribute it and/or modify it
944+ * under the terms of the GNU General Public License version 2 as published by
945+ * the Free Software Foundation.
946+ */
947+
948+#include <linux/fs.h>
949+#include <linux/namei.h>
950+#include <linux/xattr.h>
951+#include <linux/security.h>
952+#include <linux/cred.h>
953+#include "overlayfs.h"
954+
955+static const char *ovl_whiteout_symlink = "(overlay-whiteout)";
956+
957+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)
958+{
959+ int err;
960+ struct dentry *newdentry;
961+ const struct cred *old_cred;
962+ struct cred *override_cred;
963+ /* A whiteout is a symlink to ovl_whiteout_symlink carrying ovl_whiteout_xattr */
964+ /* FIXME: recheck lower dentry to see if whiteout is really needed */
965+
966+ err = -ENOMEM;
967+ override_cred = prepare_creds();
968+ if (!override_cred)
969+ goto out;
970+
971+ /*
972+ * CAP_SYS_ADMIN for setxattr
973+ * CAP_DAC_OVERRIDE for symlink creation
974+ * CAP_FOWNER for unlink in sticky directory
975+ */
976+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
977+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
978+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
979+ override_cred->fsuid = 0;
980+ override_cred->fsgid = 0;
981+ old_cred = override_creds(override_cred);
982+ /* Dentry for @dentry's name inside @upperdir */
983+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
984+ dentry->d_name.len);
985+ err = PTR_ERR(newdentry);
986+ if (IS_ERR(newdentry))
987+ goto out_put_cred;
988+
989+ /* Just been removed within the same locked region */
990+ WARN_ON(newdentry->d_inode);
991+
992+ err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink);
993+ if (err)
994+ goto out_dput;
995+
996+ ovl_dentry_version_inc(dentry->d_parent);
997+ /* Tag the symlink as a whiteout; if that fails, unlink the symlink again */
998+ err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0);
999+ if (err)
1000+ vfs_unlink(upperdir->d_inode, newdentry);
1001+
1002+out_dput:
1003+ dput(newdentry);
1004+out_put_cred:
1005+ revert_creds(old_cred);
1006+ put_cred(override_cred);
1007+out:
1008+ if (err) {
1009+ /*
1010+ * There's no way to recover from failure to whiteout.
1011+ * What should we do? Log a big fat error and... ?
1012+ */
1013+ printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n",
1014+ dentry->d_name.name);
1015+ }
1016+
1017+ return err;
1018+}
1019+
1020+static struct dentry *ovl_lookup_create(struct dentry *upperdir,
1021+ struct dentry *template)
1022+{
1023+ int err;
1024+ struct dentry *newdentry;
1025+ struct qstr *name = &template->d_name;
1026+ /* Upper dentry for @template's name; a whiteout found there is unlinked */
1027+ newdentry = lookup_one_len(name->name, upperdir, name->len);
1028+ if (IS_ERR(newdentry))
1029+ return newdentry;
1030+
1031+ if (newdentry->d_inode) {
1032+ const struct cred *old_cred;
1033+ struct cred *override_cred;
1034+
1035+ /* No need to check whiteout if lower parent is non-existent */
1036+ err = -EEXIST;
1037+ if (!ovl_dentry_lower(template->d_parent))
1038+ goto out_dput;
1039+ /* Whiteouts are symlinks, so any other type is a real existing entry */
1040+ if (!S_ISLNK(newdentry->d_inode->i_mode))
1041+ goto out_dput;
1042+
1043+ err = -ENOMEM;
1044+ override_cred = prepare_creds();
1045+ if (!override_cred)
1046+ goto out_dput;
1047+
1048+ /*
1049+ * CAP_SYS_ADMIN for getxattr
1050+ * CAP_FOWNER for unlink in sticky directory
1051+ */
1052+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1053+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
1054+ old_cred = override_creds(override_cred);
1055+
1056+ err = -EEXIST;
1057+ if (ovl_is_whiteout(newdentry))
1058+ err = vfs_unlink(upperdir->d_inode, newdentry);
1059+
1060+ revert_creds(old_cred);
1061+ put_cred(override_cred);
1062+ if (err)
1063+ goto out_dput;
1064+ /* Whiteout removed: look the name up again, restoring it on failure */
1065+ dput(newdentry);
1066+ newdentry = lookup_one_len(name->name, upperdir, name->len);
1067+ if (IS_ERR(newdentry)) {
1068+ ovl_whiteout(upperdir, template);
1069+ return newdentry;
1070+ }
1071+
1072+ /*
1073+ * Whiteout has just been successfully removed, parent
1074+ * i_mutex is still held, there's no way the lookup
1075+ * could return positive.
1076+ */
1077+ WARN_ON(newdentry->d_inode);
1078+ }
1079+
1080+ return newdentry;
1081+
1082+out_dput:
1083+ dput(newdentry);
1084+ return ERR_PTR(err);
1085+}
1086+
1087+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1088+ struct kstat *stat, const char *link)
1089+{
1090+ int err;
1091+ struct dentry *newdentry;
1092+ struct inode *dir = upperdir->d_inode;
1093+ /* Create @dentry's name in @upperdir with the type/mode taken from @stat */
1094+ newdentry = ovl_lookup_create(upperdir, dentry);
1095+ if (IS_ERR(newdentry))
1096+ goto out;
1097+
1098+ switch (stat->mode & S_IFMT) {
1099+ case S_IFREG:
1100+ err = vfs_create(dir, newdentry, stat->mode, NULL);
1101+ break;
1102+
1103+ case S_IFDIR:
1104+ err = vfs_mkdir(dir, newdentry, stat->mode);
1105+ break;
1106+
1107+ case S_IFCHR:
1108+ case S_IFBLK:
1109+ case S_IFIFO:
1110+ case S_IFSOCK:
1111+ err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev);
1112+ break;
1113+
1114+ case S_IFLNK:
1115+ err = vfs_symlink(dir, newdentry, link);
1116+ break;
1117+
1118+ default:
1119+ err = -EPERM; /* file type not handled above */
1120+ }
1121+ if (err) {
1122+ if (ovl_dentry_is_opaque(dentry))
1123+ ovl_whiteout(upperdir, dentry); /* presumably undoes whiteout removal */
1124+ dput(newdentry);
1125+ newdentry = ERR_PTR(err);
1126+ } else if (WARN_ON(!newdentry->d_inode)) {
1127+ /*
1128+ * Not quite sure if non-instantiated dentry is legal or not.
1129+ * VFS doesn't seem to care so check and warn here.
1130+ */
1131+ dput(newdentry);
1132+ newdentry = ERR_PTR(-ENOENT);
1133+ }
1134+ /* Result: the new positive upper dentry, or ERR_PTR(-errno) */
1135+out:
1136+ return newdentry;
1137+
1138+}
1139+
1140+static int ovl_set_opaque(struct dentry *upperdentry)
1141+{
1142+ int err;
1143+ const struct cred *old_cred;
1144+ struct cred *override_cred;
1145+ /* Set ovl_opaque_xattr to "y" on @upperdentry; returns 0 or -errno */
1146+ override_cred = prepare_creds();
1147+ if (!override_cred)
1148+ return -ENOMEM;
1149+
1150+ /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */
1151+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1152+ old_cred = override_creds(override_cred);
1153+ err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
1154+ revert_creds(old_cred);
1155+ put_cred(override_cred);
1156+
1157+ return err;
1158+}
1159+
1160+static int ovl_remove_opaque(struct dentry *upperdentry)
1161+{
1162+ int err;
1163+ const struct cred *old_cred;
1164+ struct cred *override_cred;
1165+ /* Remove ovl_opaque_xattr from @upperdentry; returns 0 or -errno */
1166+ override_cred = prepare_creds();
1167+ if (!override_cred)
1168+ return -ENOMEM;
1169+
1170+ /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */
1171+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1172+ old_cred = override_creds(override_cred);
1173+ err = vfs_removexattr(upperdentry, ovl_opaque_xattr);
1174+ revert_creds(old_cred);
1175+ put_cred(override_cred);
1176+
1177+ return err;
1178+}
1179+
1180+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
1181+ struct kstat *stat)
1182+{
1183+ int err;
1184+ enum ovl_path_type type;
1185+ struct path realpath;
1186+ /* Stat the real directory, then patch in overlay-specific fields */
1187+ type = ovl_path_real(dentry, &realpath);
1188+ err = vfs_getattr(realpath.mnt, realpath.dentry, stat);
1189+ if (err)
1190+ return err;
1191+ /* Report the overlay's device and inode number, not the real ones */
1192+ stat->dev = dentry->d_sb->s_dev;
1193+ stat->ino = dentry->d_inode->i_ino;
1194+
1195+ /*
1196+ * It's probably not worth it to count subdirs to get the
1197+ * correct link count. nlink=1 seems to pacify 'find' and
1198+ * other utilities.
1199+ */
1200+ if (type == OVL_PATH_MERGE)
1201+ stat->nlink = 1;
1202+
1203+ return 0;
1204+}
1205+
1206+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
1207+ const char *link)
1208+{
1209+ int err;
1210+ struct dentry *newdentry;
1211+ struct dentry *upperdir;
1212+ struct inode *inode;
1213+ struct kstat stat = {
1214+ .mode = mode,
1215+ .rdev = rdev,
1216+ };
1217+ /* Shared create path: build the upper object, then instantiate @dentry */
1218+ err = -ENOMEM;
1219+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
1220+ if (!inode)
1221+ goto out;
1222+
1223+ err = ovl_copy_up(dentry->d_parent);
1224+ if (err)
1225+ goto out_iput;
1226+
1227+ upperdir = ovl_dentry_upper(dentry->d_parent);
1228+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1229+
1230+ newdentry = ovl_upper_create(upperdir, dentry, &stat, link);
1231+ err = PTR_ERR(newdentry);
1232+ if (IS_ERR(newdentry))
1233+ goto out_unlock;
1234+
1235+ ovl_dentry_version_inc(dentry->d_parent);
1236+ if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) {
1237+ err = ovl_set_opaque(newdentry);
1238+ if (err) {
1239+ vfs_rmdir(upperdir->d_inode, newdentry); /* roll the mkdir back */
1240+ ovl_whiteout(upperdir, dentry);
1241+ goto out_dput;
1242+ }
1243+ }
1244+ ovl_dentry_update(dentry, newdentry);
1245+ ovl_copyattr(newdentry->d_inode, inode);
1246+ d_instantiate(dentry, inode);
1247+ inode = NULL; /* handed to d_instantiate(); makes the iput() below a no-op */
1248+ newdentry = NULL; /* presumably now held via ovl_dentry_update() - confirm */
1249+ err = 0;
1250+
1251+out_dput:
1252+ dput(newdentry);
1253+out_unlock:
1254+ mutex_unlock(&upperdir->d_inode->i_mutex);
1255+out_iput:
1256+ iput(inode);
1257+out:
1258+ return err;
1259+}
1260+
1261+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1262+ struct nameidata *nd)
1263+{
1264+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
1265+}
1266+
1267+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1268+{
1269+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
1270+}
1271+
1272+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
1273+ dev_t rdev)
1274+{
1275+ return ovl_create_object(dentry, mode, rdev, NULL);
1276+}
1277+
1278+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
1279+ const char *link)
1280+{
1281+ return ovl_create_object(dentry, S_IFLNK, 0, link);
1282+}
1283+
1284+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
1285+{
1286+ int err;
1287+ enum ovl_path_type type;
1288+ struct path realpath;
1289+ struct dentry *upperdir;
1290+ /* Remove any upper object, then whiteout unless pure upper and not opaque */
1291+ err = ovl_copy_up(dentry->d_parent);
1292+ if (err)
1293+ return err;
1294+
1295+ upperdir = ovl_dentry_upper(dentry->d_parent);
1296+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1297+ type = ovl_path_real(dentry, &realpath);
1298+ if (type != OVL_PATH_LOWER) {
1299+ err = -ESTALE; /* returned if the real dentry is not in @upperdir */
1300+ if (realpath.dentry->d_parent != upperdir)
1301+ goto out_d_drop;
1302+
1303+ /* FIXME: create whiteout up front and rename to target */
1304+
1305+ if (is_dir)
1306+ err = vfs_rmdir(upperdir->d_inode, realpath.dentry);
1307+ else
1308+ err = vfs_unlink(upperdir->d_inode, realpath.dentry);
1309+ if (err)
1310+ goto out_d_drop;
1311+
1312+ ovl_dentry_version_inc(dentry->d_parent);
1313+ }
1314+
1315+ if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry))
1316+ err = ovl_whiteout(upperdir, dentry);
1317+
1318+ /*
1319+ * Keeping this dentry hashed would mean having to release
1320+ * upperpath/lowerpath, which could only be done if we are the
1321+ * sole user of this dentry. Too tricky... Just unhash for
1322+ * now.
1323+ */
1324+out_d_drop:
1325+ d_drop(dentry);
1326+ mutex_unlock(&upperdir->d_inode->i_mutex);
1327+
1328+ return err;
1329+}
1330+
1331+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
1332+{
1333+ return ovl_do_remove(dentry, false);
1334+}
1335+
1336+
1337+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
1338+{
1339+ int err;
1340+ enum ovl_path_type type;
1341+ /* Anything but a pure upper dir must pass ovl_check_empty_and_clear() first */
1342+ type = ovl_path_type(dentry);
1343+ if (type != OVL_PATH_UPPER) {
1344+ err = ovl_check_empty_and_clear(dentry, type);
1345+ if (err)
1346+ return err;
1347+ }
1348+
1349+ return ovl_do_remove(dentry, true);
1350+}
1351+
1352+/* Hard-link @old at @new: copy both up, then vfs_link() in the upper layer. */
1353+static int ovl_link(struct dentry *old, struct inode *newdir,
1354+		    struct dentry *new)
1355+{
1356+	int err;
1357+	struct dentry *olddentry, *newdentry, *upperdir;
1358+	struct inode *newinode;
1359+
1360+	err = ovl_copy_up(old);
1361+	if (!err)
1362+		err = ovl_copy_up(new->d_parent);
1363+	if (err)
1364+		goto out;
1365+
1366+	upperdir = ovl_dentry_upper(new->d_parent);
1367+	mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1368+	newdentry = ovl_lookup_create(upperdir, new);
1369+	err = PTR_ERR(newdentry);
1370+	if (IS_ERR(newdentry))
1371+		goto out_unlock;
1372+
1373+	olddentry = ovl_dentry_upper(old);
1374+	err = vfs_link(olddentry, upperdir->d_inode, newdentry);
1375+	if (!err) {
1376+		if (WARN_ON(!newdentry->d_inode)) {
1377+			dput(newdentry);
1378+			err = -ENOENT;
1379+			goto out_unlock;
1380+		}
1381+		newinode = ovl_new_inode(old->d_sb, newdentry->d_inode->i_mode,
1382+					 new->d_fsdata);
1383+		if (!newinode) {
1384+			/* No overlay inode: fail with -ENOMEM, undo the upper link */
1385+			err = -ENOMEM;
1386+			vfs_unlink(upperdir->d_inode, newdentry);
1387+			goto link_fail;
1388+		}
1389+		/* Owner comes from the linked inode, as in ovl_create_object() */
1390+		ovl_copyattr(newdentry->d_inode, newinode);
1391+		ovl_dentry_version_inc(new->d_parent);
1392+		ovl_dentry_update(new, newdentry);
1393+		d_instantiate(new, newinode);
1394+	} else {
1395+link_fail:
1396+		/* ovl_lookup_create() may have unlinked a whiteout; put it back */
1397+		if (ovl_dentry_is_opaque(new))
1398+			ovl_whiteout(upperdir, new);
1399+		dput(newdentry);
1400+	}
1401+out_unlock:
1402+	mutex_unlock(&upperdir->d_inode->i_mutex);
1403+out:
1404+	return err;
1405+}
1406+
1407+static int ovl_rename(struct inode *olddir, struct dentry *old,
1408+ struct inode *newdir, struct dentry *new)
1409+{
1410+ int err;
1411+ enum ovl_path_type old_type;
1412+ enum ovl_path_type new_type;
1413+ struct dentry *old_upperdir;
1414+ struct dentry *new_upperdir;
1415+ struct dentry *olddentry;
1416+ struct dentry *newdentry;
1417+ struct dentry *trap;
1418+ bool old_opaque;
1419+ bool new_opaque;
1420+ bool new_create = false;
1421+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
1422+ /* Rename inside the upper layer, then fix up whiteouts and opaque flags */
1423+ /* Don't copy up directory trees */
1424+ old_type = ovl_path_type(old);
1425+ if (old_type != OVL_PATH_UPPER && is_dir)
1426+ return -EXDEV;
1427+
1428+ if (new->d_inode) {
1429+ new_type = ovl_path_type(new);
1430+ /* Both names already resolve to the same real inode: nothing to do */
1431+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
1432+ if (ovl_dentry_lower(old)->d_inode ==
1433+ ovl_dentry_lower(new)->d_inode)
1434+ return 0;
1435+ }
1436+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
1437+ if (ovl_dentry_upper(old)->d_inode ==
1438+ ovl_dentry_upper(new)->d_inode)
1439+ return 0;
1440+ }
1441+
1442+ if (new_type != OVL_PATH_UPPER &&
1443+ S_ISDIR(new->d_inode->i_mode)) {
1444+ err = ovl_check_empty_and_clear(new, new_type);
1445+ if (err)
1446+ return err;
1447+ }
1448+ } else {
1449+ new_type = OVL_PATH_UPPER;
1450+ }
1451+
1452+ err = ovl_copy_up(old);
1453+ if (err)
1454+ return err;
1455+
1456+ err = ovl_copy_up(new->d_parent);
1457+ if (err)
1458+ return err;
1459+
1460+ old_upperdir = ovl_dentry_upper(old->d_parent);
1461+ new_upperdir = ovl_dentry_upper(new->d_parent);
1462+
1463+ trap = lock_rename(new_upperdir, old_upperdir);
1464+
1465+ olddentry = ovl_dentry_upper(old);
1466+ newdentry = ovl_dentry_upper(new);
1467+ if (newdentry) {
1468+ dget(newdentry);
1469+ } else {
1470+ new_create = true;
1471+ newdentry = ovl_lookup_create(new_upperdir, new);
1472+ err = PTR_ERR(newdentry);
1473+ if (IS_ERR(newdentry))
1474+ goto out_unlock;
1475+ }
1476+ /* Give up if an upper dentry moved away or is lock_rename()'s trap */
1477+ err = -ESTALE;
1478+ if (olddentry->d_parent != old_upperdir)
1479+ goto out_dput;
1480+ if (newdentry->d_parent != new_upperdir)
1481+ goto out_dput;
1482+ if (olddentry == trap)
1483+ goto out_dput;
1484+ if (newdentry == trap)
1485+ goto out_dput;
1486+
1487+ old_opaque = ovl_dentry_is_opaque(old);
1488+ new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER;
1489+ /* A non-opaque directory moving onto an opaque spot is made opaque first */
1490+ if (is_dir && !old_opaque && new_opaque) {
1491+ err = ovl_set_opaque(olddentry);
1492+ if (err)
1493+ goto out_dput;
1494+ }
1495+
1496+ err = vfs_rename(old_upperdir->d_inode, olddentry,
1497+ new_upperdir->d_inode, newdentry);
1498+ /* On failure roll back what was prepared above */
1499+ if (err) {
1500+ if (new_create && ovl_dentry_is_opaque(new))
1501+ ovl_whiteout(new_upperdir, new);
1502+ if (is_dir && !old_opaque && new_opaque)
1503+ ovl_remove_opaque(olddentry);
1504+ goto out_dput;
1505+ }
1506+ /* Whiteout the old name unless it was pure upper and not opaque */
1507+ if (old_type != OVL_PATH_UPPER || old_opaque)
1508+ err = ovl_whiteout(old_upperdir, old);
1509+ if (is_dir && old_opaque && !new_opaque)
1510+ ovl_remove_opaque(olddentry);
1511+
1512+ if (old_opaque != new_opaque)
1513+ ovl_dentry_set_opaque(old, new_opaque);
1514+
1515+ ovl_dentry_version_inc(old->d_parent);
1516+ ovl_dentry_version_inc(new->d_parent);
1517+
1518+out_dput:
1519+ dput(newdentry);
1520+out_unlock:
1521+ unlock_rename(new_upperdir, old_upperdir);
1522+ return err;
1523+}
1524+
1525+const struct inode_operations ovl_dir_inode_operations = {
1526+ .lookup = ovl_lookup,
1527+ .mkdir = ovl_mkdir,
1528+ .symlink = ovl_symlink,
1529+ .unlink = ovl_unlink,
1530+ .rmdir = ovl_rmdir,
1531+ .rename = ovl_rename,
1532+ .link = ovl_link,
1533+ .setattr = ovl_setattr,
1534+ .create = ovl_create,
1535+ .mknod = ovl_mknod,
1536+ .permission = ovl_permission,
1537+ .getattr = ovl_dir_getattr,
1538+ .setxattr = ovl_setxattr,
1539+ .getxattr = ovl_getxattr,
1540+ .listxattr = ovl_listxattr,
1541+ .removexattr = ovl_removexattr,
1542+};
1543diff -ruNb a//fs/overlayfs/inode.c b//fs/overlayfs/inode.c
1544--- a//fs/overlayfs/inode.c 1970-01-01 01:00:00.000000000 +0100
1545+++ b//fs/overlayfs/inode.c 2012-10-21 15:35:10.213032386 +0100
1546@@ -0,0 +1,375 @@
1547+/*
1548+ *
1549+ * Copyright (C) 2011 Novell Inc.
1550+ *
1551+ * This program is free software; you can redistribute it and/or modify it
1552+ * under the terms of the GNU General Public License version 2 as published by
1553+ * the Free Software Foundation.
1554+ */
1555+
1556+#include <linux/fs.h>
1557+#include <linux/slab.h>
1558+#include <linux/xattr.h>
1559+#include "overlayfs.h"
1560+
1561+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
1562+{
1563+ struct dentry *upperdentry;
1564+ int err;
1565+ /* Copy up (size-capped when truncating), then change the upper inode */
1566+ if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry))
1567+ err = ovl_copy_up_truncate(dentry, attr->ia_size);
1568+ else
1569+ err = ovl_copy_up(dentry);
1570+ if (err)
1571+ return err;
1572+
1573+ upperdentry = ovl_dentry_upper(dentry);
1574+ /* NOTE(review): presumably so notify_change() below redoes the suid/sgid kill */
1575+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
1576+ attr->ia_valid &= ~ATTR_MODE;
1577+
1578+ mutex_lock(&upperdentry->d_inode->i_mutex);
1579+ err = notify_change(upperdentry, attr);
1580+ mutex_unlock(&upperdentry->d_inode->i_mutex);
1581+
1582+ return err;
1583+}
1584+
1585+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
1586+ struct kstat *stat)
1587+{
1588+ struct path realpath;
1589+ /* Plain pass-through: stat the real object behind @dentry */
1590+ ovl_path_real(dentry, &realpath);
1591+ return vfs_getattr(realpath.mnt, realpath.dentry, stat);
1592+}
1593+
1594+int ovl_permission(struct inode *inode, int mask)
1595+{
1596+ struct ovl_entry *oe;
1597+ struct dentry *alias = NULL;
1598+ struct inode *realinode;
1599+ struct dentry *realdentry;
1600+ bool is_upper;
1601+ int err;
1602+ /* Run the permission check against the real inode backing @inode */
1603+ if (S_ISDIR(inode->i_mode)) {
1604+ oe = inode->i_private; /* set by ovl_new_inode() for directories */
1605+ } else if (mask & MAY_NOT_BLOCK) {
1606+ return -ECHILD; /* the alias lookup below needs i_lock and dget() */
1607+ } else {
1608+ /*
1609+ * For non-directories find an alias and get the info
1610+ * from there.
1611+ */
1612+ spin_lock(&inode->i_lock);
1613+ if (WARN_ON(list_empty(&inode->i_dentry))) {
1614+ spin_unlock(&inode->i_lock);
1615+ return -ENOENT;
1616+ }
1617+ alias = list_entry(inode->i_dentry.next,
1618+ struct dentry, d_alias);
1619+ dget(alias);
1620+ spin_unlock(&inode->i_lock);
1621+ oe = alias->d_fsdata;
1622+ }
1623+
1624+ realdentry = ovl_entry_real(oe, &is_upper);
1625+
1626+ /* Careful in RCU walk mode */
1627+ realinode = ACCESS_ONCE(realdentry->d_inode);
1628+ if (!realinode) {
1629+ WARN_ON(!(mask & MAY_NOT_BLOCK));
1630+ err = -ENOENT;
1631+ goto out_dput;
1632+ }
1633+
1634+ if (mask & MAY_WRITE) {
1635+ umode_t mode = realinode->i_mode;
1636+
1637+ /*
1638+ * Writes will always be redirected to upper layer, so
1639+ * ignore lower layer being read-only.
1640+ *
1641+ * If the overlay itself is read-only then proceed
1642+ * with the permission check, don't return EROFS.
1643+ * This will only happen if this is the lower layer of
1644+ * another overlayfs.
1645+ *
1646+ * If upper fs becomes read-only after the overlay was
1647+ * constructed return EROFS to prevent modification of
1648+ * upper layer.
1649+ */
1650+ err = -EROFS;
1651+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
1652+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1653+ goto out_dput;
1654+ }
1655+
1656+ err = inode_only_permission(realinode, mask);
1657+out_dput:
1658+ dput(alias); /* alias stays NULL on the directory path */
1659+ return err;
1660+}
1661+
1662+
1663+struct ovl_link_data {
1664+ struct dentry *realdentry;
1665+ void *cookie;
1666+};
1667+
1668+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
1669+{
1670+ void *ret;
1671+ struct dentry *realdentry;
1672+ struct inode *realinode;
1673+ /* Follow the real symlink and wrap its cookie for ovl_put_link() */
1674+ realdentry = ovl_dentry_real(dentry);
1675+ realinode = realdentry->d_inode;
1676+
1677+ if (WARN_ON(!realinode->i_op->follow_link))
1678+ return ERR_PTR(-EPERM);
1679+
1680+ ret = realinode->i_op->follow_link(realdentry, nd);
1681+ if (IS_ERR(ret))
1682+ return ret;
1683+
1684+ if (realinode->i_op->put_link) {
1685+ struct ovl_link_data *data;
1686+
1687+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
1688+ if (!data) {
1689+ realinode->i_op->put_link(realdentry, nd, ret);
1690+ return ERR_PTR(-ENOMEM);
1691+ }
1692+ data->realdentry = realdentry;
1693+ data->cookie = ret;
1694+
1695+ return data;
1696+ } else {
1697+ return NULL; /* real inode has no put_link to call later */
1698+ }
1699+}
1700+
1701+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1702+{
1703+ struct inode *realinode;
1704+ struct ovl_link_data *data = c;
1705+ /* NULL cookie: ovl_follow_link() allocated nothing, so nothing to release */
1706+ if (!data)
1707+ return;
1708+
1709+ realinode = data->realdentry->d_inode;
1710+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
1711+ kfree(data);
1712+}
1713+
1714+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
1715+{
1716+ struct path realpath;
1717+ struct inode *realinode;
1718+ /* Hand readlink to the real inode after touching the real path's atime */
1719+ ovl_path_real(dentry, &realpath);
1720+ realinode = realpath.dentry->d_inode;
1721+
1722+ if (!realinode->i_op->readlink)
1723+ return -EINVAL;
1724+
1725+ touch_atime(&realpath);
1726+
1727+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
1728+}
1729+
1730+/* True if @name starts with "trusted.overlay." (all 16 bytes of the prefix). */
1731+static bool ovl_is_private_xattr(const char *name)
1732+{
1733+	return strncmp(name, "trusted.overlay.", 16) == 0;
1734+}
1735+
1736+int ovl_setxattr(struct dentry *dentry, const char *name,
1737+ const void *value, size_t size, int flags)
1738+{
1739+ int err;
1740+ struct dentry *upperdentry;
1741+ /* Refuse overlay-private names; everything else is set on the upper copy */
1742+ if (ovl_is_private_xattr(name))
1743+ return -EPERM;
1744+
1745+ err = ovl_copy_up(dentry);
1746+ if (err)
1747+ return err;
1748+
1749+ upperdentry = ovl_dentry_upper(dentry);
1750+ return vfs_setxattr(upperdentry, name, value, size, flags);
1751+}
1752+
1753+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1754+ void *value, size_t size)
1755+{
1756+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1757+ ovl_is_private_xattr(name))
1758+ return -ENODATA;
1759+ /* Not hidden: read the attribute from the real dentry */
1760+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
1761+}
1762+
1763+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
1764+{
1765+ ssize_t res;
1766+ int off;
1767+ /* List the real xattrs; under a merged parent, strip overlay-private names */
1768+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
1769+ if (res <= 0 || size == 0)
1770+ return res;
1771+
1772+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
1773+ return res;
1774+
1775+ /* filter out private xattrs */
1776+ for (off = 0; off < res;) {
1777+ char *s = list + off;
1778+ size_t slen = strlen(s) + 1; /* name plus its NUL */
1779+
1780+ BUG_ON(off + slen > res);
1781+
1782+ if (ovl_is_private_xattr(s)) {
1783+ res -= slen;
1784+ memmove(s, s + slen, res - off); /* slide the tail down over it */
1785+ } else {
1786+ off += slen;
1787+ }
1788+ }
1789+
1790+ return res;
1791+}
1792+
1793+int ovl_removexattr(struct dentry *dentry, const char *name)
1794+{
1795+ int err;
1796+ struct path realpath;
1797+ enum ovl_path_type type;
1798+ /* Private names under a merged parent are reported as absent */
1799+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1800+ ovl_is_private_xattr(name))
1801+ return -ENODATA;
1802+ /* Lower-only: probe for @name first so a miss costs no copy-up */
1803+ type = ovl_path_real(dentry, &realpath);
1804+ if (type == OVL_PATH_LOWER) {
1805+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
1806+ if (err < 0)
1807+ return err;
1808+
1809+ err = ovl_copy_up(dentry);
1810+ if (err)
1811+ return err;
1812+
1813+ ovl_path_upper(dentry, &realpath);
1814+ }
1815+
1816+ return vfs_removexattr(realpath.dentry, name);
1817+}
1818+
1819+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
1820+ struct dentry *realdentry)
1821+{
1822+ if (type != OVL_PATH_LOWER)
1823+ return false;
1824+ /* Special files are never copied up for open */
1825+ if (special_file(realdentry->d_inode->i_mode))
1826+ return false;
1827+ /* Neither writing nor truncating: the lower file will do */
1828+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
1829+ return false;
1830+ /* Lower, non-special, and opened for write or truncate */
1831+ return true;
1832+}
1833+
1834+static struct file *ovl_open(struct dentry *dentry, struct file *file,
1835+ const struct cred *cred)
1836+{
1837+ int err;
1838+ struct path realpath;
1839+ enum ovl_path_type type;
1840+ /* Open the real file, copying it up first if the open would modify it */
1841+ type = ovl_path_real(dentry, &realpath);
1842+ if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
1843+ if (file->f_flags & O_TRUNC)
1844+ err = ovl_copy_up_truncate(dentry, 0); /* copy up with size 0 */
1845+ else
1846+ err = ovl_copy_up(dentry);
1847+ if (err)
1848+ return ERR_PTR(err);
1849+
1850+ ovl_path_upper(dentry, &realpath);
1851+ }
1852+
1853+ return vfs_open(&realpath, file, cred);
1854+}
1855+
1856+static const struct inode_operations ovl_file_inode_operations = {
1857+ .setattr = ovl_setattr,
1858+ .permission = ovl_permission,
1859+ .getattr = ovl_getattr,
1860+ .setxattr = ovl_setxattr,
1861+ .getxattr = ovl_getxattr,
1862+ .listxattr = ovl_listxattr,
1863+ .removexattr = ovl_removexattr,
1864+ .open = ovl_open,
1865+};
1866+
1867+static const struct inode_operations ovl_symlink_inode_operations = {
1868+ .setattr = ovl_setattr,
1869+ .follow_link = ovl_follow_link,
1870+ .put_link = ovl_put_link,
1871+ .readlink = ovl_readlink,
1872+ .getattr = ovl_getattr,
1873+ .setxattr = ovl_setxattr,
1874+ .getxattr = ovl_getxattr,
1875+ .listxattr = ovl_listxattr,
1876+ .removexattr = ovl_removexattr,
1877+};
1878+
1879+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1880+ struct ovl_entry *oe)
1881+{
1882+ struct inode *inode;
1883+ /* Allocate an overlay inode and pick its ops by file type; NULL on failure */
1884+ inode = new_inode(sb);
1885+ if (!inode)
1886+ return NULL;
1887+
1888+ mode &= S_IFMT; /* only the file type bits end up in i_mode */
1889+
1890+ inode->i_ino = get_next_ino();
1891+ inode->i_mode = mode;
1892+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
1893+
1894+ switch (mode) {
1895+ case S_IFDIR:
1896+ inode->i_private = oe; /* read back by ovl_permission() */
1897+ inode->i_op = &ovl_dir_inode_operations;
1898+ inode->i_fop = &ovl_dir_operations;
1899+ break;
1900+
1901+ case S_IFLNK:
1902+ inode->i_op = &ovl_symlink_inode_operations;
1903+ break;
1904+
1905+ case S_IFREG:
1906+ case S_IFSOCK:
1907+ case S_IFBLK:
1908+ case S_IFCHR:
1909+ case S_IFIFO:
1910+ inode->i_op = &ovl_file_inode_operations;
1911+ break;
1912+
1913+ default:
1914+ WARN(1, "illegal file type: %i\n", mode);
1915+ iput(inode);
1916+ inode = NULL;
1917+ }
1918+
1919+ return inode;
1920+
1921+}
1922diff -ruNb a//fs/overlayfs/Kconfig b//fs/overlayfs/Kconfig
1923--- a//fs/overlayfs/Kconfig 1970-01-01 01:00:00.000000000 +0100
1924+++ b//fs/overlayfs/Kconfig 2012-10-21 15:33:23.868301470 +0100
1925@@ -0,0 +1,4 @@
1926+config OVERLAYFS_FS
1927+ tristate "Overlay filesystem support"
1928+ help
1929+ Add support for overlay filesystem.
1930diff -ruNb a//fs/overlayfs/Makefile b//fs/overlayfs/Makefile
1931--- a//fs/overlayfs/Makefile 1970-01-01 01:00:00.000000000 +0100
1932+++ b//fs/overlayfs/Makefile 2012-10-21 15:33:23.868301470 +0100
1933@@ -0,0 +1,7 @@
1934+#
1935+# Makefile for the overlay filesystem.
1936+#
1937+
1938+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
1939+
1940+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
1941diff -ruNb a//fs/overlayfs/overlayfs.h b//fs/overlayfs/overlayfs.h
1942--- a//fs/overlayfs/overlayfs.h 1970-01-01 01:00:00.000000000 +0100
1943+++ b//fs/overlayfs/overlayfs.h 2012-10-21 15:35:40.472972180 +0100
1944@@ -0,0 +1,70 @@
1945+/*
1946+ *
1947+ * Copyright (C) 2011 Novell Inc.
1948+ *
1949+ * This program is free software; you can redistribute it and/or modify it
1950+ * under the terms of the GNU General Public License version 2 as published by
1951+ * the Free Software Foundation.
1952+ */
1953+
1954+struct ovl_entry;
1955+
1956+enum ovl_path_type {
1957+ OVL_PATH_UPPER,
1958+ OVL_PATH_MERGE,
1959+ OVL_PATH_LOWER,
1960+};
1961+
1962+extern const char *ovl_opaque_xattr;
1963+extern const char *ovl_whiteout_xattr;
1964+extern const struct dentry_operations ovl_dentry_operations;
1965+
1966+enum ovl_path_type ovl_path_type(struct dentry *dentry);
1967+u64 ovl_dentry_version_get(struct dentry *dentry);
1968+void ovl_dentry_version_inc(struct dentry *dentry);
1969+void ovl_path_upper(struct dentry *dentry, struct path *path);
1970+void ovl_path_lower(struct dentry *dentry, struct path *path);
1971+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
1972+struct dentry *ovl_dentry_upper(struct dentry *dentry);
1973+struct dentry *ovl_dentry_lower(struct dentry *dentry);
1974+struct dentry *ovl_dentry_real(struct dentry *dentry);
1975+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
1976+bool ovl_dentry_is_opaque(struct dentry *dentry);
1977+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
1978+bool ovl_is_whiteout(struct dentry *dentry);
1979+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
1980+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
1981+ struct nameidata *nd);
1982+struct file *ovl_path_open(struct path *path, int flags);
1983+
1984+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1985+ struct kstat *stat, const char *link);
1986+
1987+/* readdir.c */
1988+extern const struct file_operations ovl_dir_operations;
1989+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type);
1990+
1991+/* inode.c */
1992+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
1993+int ovl_permission(struct inode *inode, int mask);
1994+int ovl_setxattr(struct dentry *dentry, const char *name,
1995+ const void *value, size_t size, int flags);
1996+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1997+ void *value, size_t size);
1998+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
1999+int ovl_removexattr(struct dentry *dentry, const char *name);
2000+
2001+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
2002+ struct ovl_entry *oe);
2003+static inline void ovl_copyattr(struct inode *from, struct inode *to)
2004+{
2005+ to->i_uid = from->i_uid;
2006+ to->i_gid = from->i_gid;
2007+}
2008+
2009+/* dir.c */
2010+extern const struct inode_operations ovl_dir_inode_operations;
2011+
2012+/* copy_up.c */
2013+int ovl_copy_up(struct dentry *dentry);
2014+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size);
2015diff -ruNb a//fs/overlayfs/readdir.c b//fs/overlayfs/readdir.c
2016--- a//fs/overlayfs/readdir.c 1970-01-01 01:00:00.000000000 +0100
2017+++ b//fs/overlayfs/readdir.c 2012-10-21 15:33:23.870301202 +0100
2018@@ -0,0 +1,566 @@
2019+/*
2020+ *
2021+ * Copyright (C) 2011 Novell Inc.
2022+ *
2023+ * This program is free software; you can redistribute it and/or modify it
2024+ * under the terms of the GNU General Public License version 2 as published by
2025+ * the Free Software Foundation.
2026+ */
2027+
2028+#include <linux/fs.h>
2029+#include <linux/slab.h>
2030+#include <linux/namei.h>
2031+#include <linux/file.h>
2032+#include <linux/xattr.h>
2033+#include <linux/rbtree.h>
2034+#include <linux/security.h>
2035+#include <linux/cred.h>
2036+#include "overlayfs.h"
2037+
2038+struct ovl_cache_entry {
2039+ const char *name;
2040+ unsigned int len;
2041+ unsigned int type;
2042+ u64 ino;
2043+ bool is_whiteout;
2044+ struct list_head l_node;
2045+ struct rb_node node;
2046+};
2047+
2048+struct ovl_readdir_data {
2049+ struct rb_root *root;
2050+ struct list_head *list;
2051+ struct list_head *middle;
2052+ struct dentry *dir;
2053+ int count;
2054+ int err;
2055+};
2056+
2057+struct ovl_dir_file {
2058+ bool is_real;
2059+ bool is_cached;
2060+ struct list_head cursor;
2061+ u64 cache_version;
2062+ struct list_head cache;
2063+ struct file *realfile;
2064+};
2065+
2066+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
2067+{
2068+ return container_of(n, struct ovl_cache_entry, node);
2069+}
2070+
2071+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
2072+ const char *name, int len)
2073+{
2074+ struct rb_node *node = root->rb_node;
2075+ int cmp;
2076+ /* Tree search; when the first @len bytes tie, the shorter name sorts first */
2077+ while (node) {
2078+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
2079+
2080+ cmp = strncmp(name, p->name, len);
2081+ if (cmp > 0)
2082+ node = p->node.rb_right;
2083+ else if (cmp < 0 || len < p->len)
2084+ node = p->node.rb_left;
2085+ else
2086+ return p;
2087+ }
2088+
2089+ return NULL;
2090+}
2091+
2092+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
2093+ u64 ino, unsigned int d_type)
2094+{
2095+ struct ovl_cache_entry *p;
2096+ /* One allocation: the entry followed by a NUL-terminated copy of @name */
2097+ p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
2098+ if (p) {
2099+ char *name_copy = (char *) (p + 1);
2100+ memcpy(name_copy, name, len);
2101+ name_copy[len] = '\0';
2102+ p->name = name_copy;
2103+ p->len = len;
2104+ p->type = d_type;
2105+ p->ino = ino;
2106+ p->is_whiteout = false;
2107+ }
2108+
2109+ return p;
2110+}
2111+
2112+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
2113+ const char *name, int len, u64 ino,
2114+ unsigned int d_type)
2115+{
2116+ struct rb_node **newp = &rdd->root->rb_node;
2117+ struct rb_node *parent = NULL;
2118+ struct ovl_cache_entry *p;
2119+ /* Add @name to rdd's tree and list; an already present name is left alone */
2120+ while (*newp) {
2121+ int cmp;
2122+ struct ovl_cache_entry *tmp;
2123+
2124+ parent = *newp;
2125+ tmp = ovl_cache_entry_from_node(*newp);
2126+ cmp = strncmp(name, tmp->name, len);
2127+ if (cmp > 0)
2128+ newp = &tmp->node.rb_right;
2129+ else if (cmp < 0 || len < tmp->len)
2130+ newp = &tmp->node.rb_left;
2131+ else
2132+ return 0; /* already in the tree */
2133+ }
2134+
2135+ p = ovl_cache_entry_new(name, len, ino, d_type);
2136+ if (p == NULL)
2137+ return -ENOMEM;
2138+
2139+ list_add_tail(&p->l_node, rdd->list);
2140+ rb_link_node(&p->node, parent, newp);
2141+ rb_insert_color(&p->node, rdd->root);
2142+
2143+ return 0;
2144+}
2145+
2146+static int ovl_fill_lower(void *buf, const char *name, int namelen,
2147+ loff_t offset, u64 ino, unsigned int d_type)
2148+{
2149+ struct ovl_readdir_data *rdd = buf;
2150+ struct ovl_cache_entry *p;
2151+
2152+ rdd->count++;
2153+ p = ovl_cache_entry_find(rdd->root, name, namelen);
2154+ if (p) {
2155+ list_move_tail(&p->l_node, rdd->middle);
2156+ } else {
2157+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
2158+ if (p == NULL)
2159+ rdd->err = -ENOMEM;
2160+ else
2161+ list_add_tail(&p->l_node, rdd->middle);
2162+ }
2163+
2164+ return rdd->err;
2165+}
2166+
2167+static void ovl_cache_free(struct list_head *list)
2168+{
2169+ struct ovl_cache_entry *p;
2170+ struct ovl_cache_entry *n;
2171+
2172+ list_for_each_entry_safe(p, n, list, l_node)
2173+ kfree(p);
2174+
2175+ INIT_LIST_HEAD(list);
2176+}
2177+
2178+static int ovl_fill_upper(void *buf, const char *name, int namelen,
2179+ loff_t offset, u64 ino, unsigned int d_type)
2180+{
2181+ struct ovl_readdir_data *rdd = buf;
2182+
2183+ rdd->count++;
2184+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
2185+}
2186+
2187+static inline int ovl_dir_read(struct path *realpath,
2188+ struct ovl_readdir_data *rdd, filldir_t filler)
2189+{
2190+ struct file *realfile;
2191+ int err;
2192+
2193+ realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
2194+ if (IS_ERR(realfile))
2195+ return PTR_ERR(realfile);
2196+
2197+ do {
2198+ rdd->count = 0;
2199+ rdd->err = 0;
2200+ err = vfs_readdir(realfile, filler, rdd);
2201+ if (err >= 0)
2202+ err = rdd->err;
2203+ } while (!err && rdd->count);
2204+ fput(realfile);
2205+
2206+ return 0;
2207+}
2208+
2209+static void ovl_dir_reset(struct file *file)
2210+{
2211+ struct ovl_dir_file *od = file->private_data;
2212+ enum ovl_path_type type = ovl_path_type(file->f_path.dentry);
2213+
2214+ if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) {
2215+ list_del_init(&od->cursor);
2216+ ovl_cache_free(&od->cache);
2217+ od->is_cached = false;
2218+ }
2219+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
2220+ if (od->is_real && type == OVL_PATH_MERGE) {
2221+ fput(od->realfile);
2222+ od->realfile = NULL;
2223+ od->is_real = false;
2224+ }
2225+}
2226+
2227+static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd)
2228+{
2229+ struct ovl_cache_entry *p;
2230+ struct dentry *dentry;
2231+ const struct cred *old_cred;
2232+ struct cred *override_cred;
2233+
2234+ override_cred = prepare_creds();
2235+ if (!override_cred) {
2236+ ovl_cache_free(rdd->list);
2237+ return -ENOMEM;
2238+ }
2239+
2240+ /*
2241+ * CAP_SYS_ADMIN for getxattr
2242+ * CAP_DAC_OVERRIDE for lookup
2243+ */
2244+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2245+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2246+ old_cred = override_creds(override_cred);
2247+
2248+ mutex_lock(&rdd->dir->d_inode->i_mutex);
2249+ list_for_each_entry(p, rdd->list, l_node) {
2250+ if (p->type != DT_LNK)
2251+ continue;
2252+
2253+ dentry = lookup_one_len(p->name, rdd->dir, p->len);
2254+ if (IS_ERR(dentry))
2255+ continue;
2256+
2257+ p->is_whiteout = ovl_is_whiteout(dentry);
2258+ dput(dentry);
2259+ }
2260+ mutex_unlock(&rdd->dir->d_inode->i_mutex);
2261+
2262+ revert_creds(old_cred);
2263+ put_cred(override_cred);
2264+
2265+ return 0;
2266+}
2267+
2268+static inline int ovl_dir_read_merged(struct path *upperpath,
2269+ struct path *lowerpath,
2270+ struct ovl_readdir_data *rdd)
2271+{
2272+ int err;
2273+ struct rb_root root = RB_ROOT;
2274+ struct list_head middle;
2275+
2276+ rdd->root = &root;
2277+ if (upperpath->dentry) {
2278+ rdd->dir = upperpath->dentry;
2279+ err = ovl_dir_read(upperpath, rdd, ovl_fill_upper);
2280+ if (err)
2281+ goto out;
2282+
2283+ err = ovl_dir_mark_whiteouts(rdd);
2284+ if (err)
2285+ goto out;
2286+ }
2287+ /*
2288+ * Insert lowerpath entries before upperpath ones, this allows
2289+ * offsets to be reasonably constant
2290+ */
2291+ list_add(&middle, rdd->list);
2292+ rdd->middle = &middle;
2293+ err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower);
2294+ list_del(&middle);
2295+out:
2296+ rdd->root = NULL;
2297+
2298+ return err;
2299+}
2300+
2301+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
2302+{
2303+ struct list_head *l;
2304+ loff_t off;
2305+
2306+ l = od->cache.next;
2307+ for (off = 0; off < pos; off++) {
2308+ if (l == &od->cache)
2309+ break;
2310+ l = l->next;
2311+ }
2312+ list_move_tail(&od->cursor, l);
2313+}
2314+
2315+static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
2316+{
2317+ struct ovl_dir_file *od = file->private_data;
2318+ int res;
2319+
2320+ if (!file->f_pos)
2321+ ovl_dir_reset(file);
2322+
2323+ if (od->is_real) {
2324+ res = vfs_readdir(od->realfile, filler, buf);
2325+ file->f_pos = od->realfile->f_pos;
2326+
2327+ return res;
2328+ }
2329+
2330+ if (!od->is_cached) {
2331+ struct path lowerpath;
2332+ struct path upperpath;
2333+ struct ovl_readdir_data rdd = { .list = &od->cache };
2334+
2335+ ovl_path_lower(file->f_path.dentry, &lowerpath);
2336+ ovl_path_upper(file->f_path.dentry, &upperpath);
2337+
2338+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2339+ if (res) {
2340+ ovl_cache_free(rdd.list);
2341+ return res;
2342+ }
2343+
2344+ od->cache_version = ovl_dentry_version_get(file->f_path.dentry);
2345+ od->is_cached = true;
2346+
2347+ ovl_seek_cursor(od, file->f_pos);
2348+ }
2349+
2350+ while (od->cursor.next != &od->cache) {
2351+ int over;
2352+ loff_t off;
2353+ struct ovl_cache_entry *p;
2354+
2355+ p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node);
2356+ off = file->f_pos;
2357+ if (!p->is_whiteout) {
2358+ over = filler(buf, p->name, p->len, off, p->ino,
2359+ p->type);
2360+ if (over)
2361+ break;
2362+ }
2363+ file->f_pos++;
2364+ list_move(&od->cursor, &p->l_node);
2365+ }
2366+
2367+ return 0;
2368+}
2369+
2370+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
2371+{
2372+ loff_t res;
2373+ struct ovl_dir_file *od = file->private_data;
2374+
2375+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
2376+ if (!file->f_pos)
2377+ ovl_dir_reset(file);
2378+
2379+ if (od->is_real) {
2380+ res = vfs_llseek(od->realfile, offset, origin);
2381+ file->f_pos = od->realfile->f_pos;
2382+ } else {
2383+ res = -EINVAL;
2384+
2385+ switch (origin) {
2386+ case SEEK_CUR:
2387+ offset += file->f_pos;
2388+ break;
2389+ case SEEK_SET:
2390+ break;
2391+ default:
2392+ goto out_unlock;
2393+ }
2394+ if (offset < 0)
2395+ goto out_unlock;
2396+
2397+ if (offset != file->f_pos) {
2398+ file->f_pos = offset;
2399+ if (od->is_cached)
2400+ ovl_seek_cursor(od, offset);
2401+ }
2402+ res = offset;
2403+ }
2404+out_unlock:
2405+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
2406+
2407+ return res;
2408+}
2409+
2410+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
2411+ int datasync)
2412+{
2413+ struct ovl_dir_file *od = file->private_data;
2414+
2415+ /* May need to reopen directory if it got copied up */
2416+ if (!od->realfile) {
2417+ struct path upperpath;
2418+
2419+ ovl_path_upper(file->f_path.dentry, &upperpath);
2420+ od->realfile = ovl_path_open(&upperpath, O_RDONLY);
2421+ if (IS_ERR(od->realfile))
2422+ return PTR_ERR(od->realfile);
2423+ }
2424+
2425+ return vfs_fsync_range(od->realfile, start, end, datasync);
2426+}
2427+
2428+static int ovl_dir_release(struct inode *inode, struct file *file)
2429+{
2430+ struct ovl_dir_file *od = file->private_data;
2431+
2432+ list_del(&od->cursor);
2433+ ovl_cache_free(&od->cache);
2434+ if (od->realfile)
2435+ fput(od->realfile);
2436+ kfree(od);
2437+
2438+ return 0;
2439+}
2440+
2441+static int ovl_dir_open(struct inode *inode, struct file *file)
2442+{
2443+ struct path realpath;
2444+ struct file *realfile;
2445+ struct ovl_dir_file *od;
2446+ enum ovl_path_type type;
2447+
2448+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
2449+ if (!od)
2450+ return -ENOMEM;
2451+
2452+ type = ovl_path_real(file->f_path.dentry, &realpath);
2453+ realfile = ovl_path_open(&realpath, file->f_flags);
2454+ if (IS_ERR(realfile)) {
2455+ kfree(od);
2456+ return PTR_ERR(realfile);
2457+ }
2458+ INIT_LIST_HEAD(&od->cache);
2459+ INIT_LIST_HEAD(&od->cursor);
2460+ od->is_cached = false;
2461+ od->realfile = realfile;
2462+ od->is_real = (type != OVL_PATH_MERGE);
2463+ file->private_data = od;
2464+
2465+ return 0;
2466+}
2467+
2468+const struct file_operations ovl_dir_operations = {
2469+ .read = generic_read_dir,
2470+ .open = ovl_dir_open,
2471+ .readdir = ovl_readdir,
2472+ .llseek = ovl_dir_llseek,
2473+ .fsync = ovl_dir_fsync,
2474+ .release = ovl_dir_release,
2475+};
2476+
2477+static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
2478+{
2479+ int err;
2480+ struct path lowerpath;
2481+ struct path upperpath;
2482+ struct ovl_cache_entry *p;
2483+ struct ovl_readdir_data rdd = { .list = list };
2484+
2485+ ovl_path_upper(dentry, &upperpath);
2486+ ovl_path_lower(dentry, &lowerpath);
2487+
2488+ err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2489+ if (err)
2490+ return err;
2491+
2492+ err = 0;
2493+
2494+ list_for_each_entry(p, list, l_node) {
2495+ if (p->is_whiteout)
2496+ continue;
2497+
2498+ if (p->name[0] == '.') {
2499+ if (p->len == 1)
2500+ continue;
2501+ if (p->len == 2 && p->name[1] == '.')
2502+ continue;
2503+ }
2504+ err = -ENOTEMPTY;
2505+ break;
2506+ }
2507+
2508+ return err;
2509+}
2510+
2511+static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list)
2512+{
2513+ struct path upperpath;
2514+ struct dentry *upperdir;
2515+ struct ovl_cache_entry *p;
2516+ const struct cred *old_cred;
2517+ struct cred *override_cred;
2518+ int err;
2519+
2520+ ovl_path_upper(dir, &upperpath);
2521+ upperdir = upperpath.dentry;
2522+
2523+ override_cred = prepare_creds();
2524+ if (!override_cred)
2525+ return -ENOMEM;
2526+
2527+ /*
2528+ * CAP_DAC_OVERRIDE for lookup and unlink
2529+ * CAP_SYS_ADMIN for setxattr of "trusted" namespace
2530+ * CAP_FOWNER for unlink in sticky directory
2531+ */
2532+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2533+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2534+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
2535+ old_cred = override_creds(override_cred);
2536+
2537+ err = vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0);
2538+ if (err)
2539+ goto out_revert_creds;
2540+
2541+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2542+ list_for_each_entry(p, list, l_node) {
2543+ struct dentry *dentry;
2544+ int ret;
2545+
2546+ if (!p->is_whiteout)
2547+ continue;
2548+
2549+ dentry = lookup_one_len(p->name, upperdir, p->len);
2550+ if (IS_ERR(dentry)) {
2551+ printk(KERN_WARNING
2552+ "overlayfs: failed to lookup whiteout %.*s: %li\n",
2553+ p->len, p->name, PTR_ERR(dentry));
2554+ continue;
2555+ }
2556+ ret = vfs_unlink(upperdir->d_inode, dentry);
2557+ dput(dentry);
2558+ if (ret)
2559+ printk(KERN_WARNING
2560+ "overlayfs: failed to unlink whiteout %.*s: %i\n",
2561+ p->len, p->name, ret);
2562+ }
2563+ mutex_unlock(&upperdir->d_inode->i_mutex);
2564+
2565+out_revert_creds:
2566+ revert_creds(old_cred);
2567+ put_cred(override_cred);
2568+
2569+ return err;
2570+}
2571+
2572+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type)
2573+{
2574+ int err;
2575+ LIST_HEAD(list);
2576+
2577+ err = ovl_check_empty_dir(dentry, &list);
2578+ if (!err && type == OVL_PATH_MERGE)
2579+ err = ovl_remove_whiteouts(dentry, &list);
2580+
2581+ ovl_cache_free(&list);
2582+
2583+ return err;
2584+}
2585diff -ruNb a//fs/overlayfs/super.c b//fs/overlayfs/super.c
2586--- a//fs/overlayfs/super.c 1970-01-01 01:00:00.000000000 +0100
2587+++ b//fs/overlayfs/super.c 2012-10-21 15:35:40.473972046 +0100
2588@@ -0,0 +1,665 @@
2589+/*
2590+ *
2591+ * Copyright (C) 2011 Novell Inc.
2592+ *
2593+ * This program is free software; you can redistribute it and/or modify it
2594+ * under the terms of the GNU General Public License version 2 as published by
2595+ * the Free Software Foundation.
2596+ */
2597+
2598+#include <linux/fs.h>
2599+#include <linux/namei.h>
2600+#include <linux/xattr.h>
2601+#include <linux/security.h>
2602+#include <linux/mount.h>
2603+#include <linux/slab.h>
2604+#include <linux/parser.h>
2605+#include <linux/module.h>
2606+#include <linux/cred.h>
2607+#include <linux/sched.h>
2608+#include <linux/seq_file.h>
2609+#include "overlayfs.h"
2610+
2611+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
2612+MODULE_DESCRIPTION("Overlay filesystem");
2613+MODULE_LICENSE("GPL");
2614+
/* Mount options as parsed by ovl_parse_opt(); the strings are kmalloc'ed. */
struct ovl_config {
	char *lowerdir;		/* value of the "lowerdir=" option */
	char *upperdir;		/* value of the "upperdir=" option */
};
2619+
2620+/* private information held for overlayfs's superblock */
2621+struct ovl_fs {
2622+ struct vfsmount *upper_mnt;
2623+ struct vfsmount *lower_mnt;
2624+ /* pathnames of lower and upper dirs, for show_options */
2625+ struct ovl_config config;
2626+};
2627+
2628+/* private information held for every overlayfs dentry */
2629+struct ovl_entry {
2630+ /*
2631+ * Keep "double reference" on upper dentries, so that
2632+ * d_delete() doesn't think it's OK to reset d_inode to NULL.
2633+ */
2634+ struct dentry *__upperdentry;
2635+ struct dentry *lowerdentry;
2636+ union {
2637+ struct {
2638+ u64 version;
2639+ bool opaque;
2640+ };
2641+ struct rcu_head rcu;
2642+ };
2643+};
2644+
/* Extended attribute names; both are tested against the value "y". */
const char *ovl_whiteout_xattr = "trusted.overlay.whiteout";	/* on symlinks */
const char *ovl_opaque_xattr = "trusted.overlay.opaque";	/* on directories */
2647+
2648+
2649+enum ovl_path_type ovl_path_type(struct dentry *dentry)
2650+{
2651+ struct ovl_entry *oe = dentry->d_fsdata;
2652+
2653+ if (oe->__upperdentry) {
2654+ if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode))
2655+ return OVL_PATH_MERGE;
2656+ else
2657+ return OVL_PATH_UPPER;
2658+ } else {
2659+ return OVL_PATH_LOWER;
2660+ }
2661+}
2662+
2663+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
2664+{
2665+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
2666+ smp_read_barrier_depends();
2667+ return upperdentry;
2668+}
2669+
2670+void ovl_path_upper(struct dentry *dentry, struct path *path)
2671+{
2672+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2673+ struct ovl_entry *oe = dentry->d_fsdata;
2674+
2675+ path->mnt = ofs->upper_mnt;
2676+ path->dentry = ovl_upperdentry_dereference(oe);
2677+}
2678+
2679+void ovl_path_lower(struct dentry *dentry, struct path *path)
2680+{
2681+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2682+ struct ovl_entry *oe = dentry->d_fsdata;
2683+
2684+ path->mnt = ofs->lower_mnt;
2685+ path->dentry = oe->lowerdentry;
2686+}
2687+
2688+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
2689+{
2690+
2691+ enum ovl_path_type type = ovl_path_type(dentry);
2692+
2693+ if (type == OVL_PATH_LOWER)
2694+ ovl_path_lower(dentry, path);
2695+ else
2696+ ovl_path_upper(dentry, path);
2697+
2698+ return type;
2699+}
2700+
2701+struct dentry *ovl_dentry_upper(struct dentry *dentry)
2702+{
2703+ struct ovl_entry *oe = dentry->d_fsdata;
2704+
2705+ return ovl_upperdentry_dereference(oe);
2706+}
2707+
2708+struct dentry *ovl_dentry_lower(struct dentry *dentry)
2709+{
2710+ struct ovl_entry *oe = dentry->d_fsdata;
2711+
2712+ return oe->lowerdentry;
2713+}
2714+
2715+struct dentry *ovl_dentry_real(struct dentry *dentry)
2716+{
2717+ struct ovl_entry *oe = dentry->d_fsdata;
2718+ struct dentry *realdentry;
2719+
2720+ realdentry = ovl_upperdentry_dereference(oe);
2721+ if (!realdentry)
2722+ realdentry = oe->lowerdentry;
2723+
2724+ return realdentry;
2725+}
2726+
2727+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
2728+{
2729+ struct dentry *realdentry;
2730+
2731+ realdentry = ovl_upperdentry_dereference(oe);
2732+ if (realdentry) {
2733+ *is_upper = true;
2734+ } else {
2735+ realdentry = oe->lowerdentry;
2736+ *is_upper = false;
2737+ }
2738+ return realdentry;
2739+}
2740+
2741+bool ovl_dentry_is_opaque(struct dentry *dentry)
2742+{
2743+ struct ovl_entry *oe = dentry->d_fsdata;
2744+ return oe->opaque;
2745+}
2746+
2747+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
2748+{
2749+ struct ovl_entry *oe = dentry->d_fsdata;
2750+ oe->opaque = opaque;
2751+}
2752+
2753+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
2754+{
2755+ struct ovl_entry *oe = dentry->d_fsdata;
2756+
2757+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
2758+ WARN_ON(oe->__upperdentry);
2759+ BUG_ON(!upperdentry->d_inode);
2760+ smp_wmb();
2761+ oe->__upperdentry = dget(upperdentry);
2762+}
2763+
2764+void ovl_dentry_version_inc(struct dentry *dentry)
2765+{
2766+ struct ovl_entry *oe = dentry->d_fsdata;
2767+
2768+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2769+ oe->version++;
2770+}
2771+
2772+u64 ovl_dentry_version_get(struct dentry *dentry)
2773+{
2774+ struct ovl_entry *oe = dentry->d_fsdata;
2775+
2776+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2777+ return oe->version;
2778+}
2779+
2780+bool ovl_is_whiteout(struct dentry *dentry)
2781+{
2782+ int res;
2783+ char val;
2784+
2785+ if (!dentry)
2786+ return false;
2787+ if (!dentry->d_inode)
2788+ return false;
2789+ if (!S_ISLNK(dentry->d_inode->i_mode))
2790+ return false;
2791+
2792+ res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1);
2793+ if (res == 1 && val == 'y')
2794+ return true;
2795+
2796+ return false;
2797+}
2798+
2799+static bool ovl_is_opaquedir(struct dentry *dentry)
2800+{
2801+ int res;
2802+ char val;
2803+
2804+ if (!S_ISDIR(dentry->d_inode->i_mode))
2805+ return false;
2806+
2807+ res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1);
2808+ if (res == 1 && val == 'y')
2809+ return true;
2810+
2811+ return false;
2812+}
2813+
2814+static void ovl_entry_free(struct rcu_head *head)
2815+{
2816+ struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu);
2817+ kfree(oe);
2818+}
2819+
2820+static void ovl_dentry_release(struct dentry *dentry)
2821+{
2822+ struct ovl_entry *oe = dentry->d_fsdata;
2823+
2824+ if (oe) {
2825+ dput(oe->__upperdentry);
2826+ dput(oe->__upperdentry);
2827+ dput(oe->lowerdentry);
2828+ call_rcu(&oe->rcu, ovl_entry_free);
2829+ }
2830+}
2831+
2832+const struct dentry_operations ovl_dentry_operations = {
2833+ .d_release = ovl_dentry_release,
2834+};
2835+
2836+static struct ovl_entry *ovl_alloc_entry(void)
2837+{
2838+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
2839+}
2840+
2841+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
2842+ struct qstr *name)
2843+{
2844+ struct dentry *dentry;
2845+
2846+ mutex_lock(&dir->d_inode->i_mutex);
2847+ dentry = lookup_one_len(name->name, dir, name->len);
2848+ mutex_unlock(&dir->d_inode->i_mutex);
2849+
2850+ if (IS_ERR(dentry)) {
2851+ if (PTR_ERR(dentry) == -ENOENT)
2852+ dentry = NULL;
2853+ } else if (!dentry->d_inode) {
2854+ dput(dentry);
2855+ dentry = NULL;
2856+ }
2857+ return dentry;
2858+}
2859+
2860+static int ovl_do_lookup(struct dentry *dentry)
2861+{
2862+ struct ovl_entry *oe;
2863+ struct dentry *upperdir;
2864+ struct dentry *lowerdir;
2865+ struct dentry *upperdentry = NULL;
2866+ struct dentry *lowerdentry = NULL;
2867+ struct inode *inode = NULL;
2868+ int err;
2869+
2870+ err = -ENOMEM;
2871+ oe = ovl_alloc_entry();
2872+ if (!oe)
2873+ goto out;
2874+
2875+ upperdir = ovl_dentry_upper(dentry->d_parent);
2876+ lowerdir = ovl_dentry_lower(dentry->d_parent);
2877+
2878+ if (upperdir) {
2879+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
2880+ err = PTR_ERR(upperdentry);
2881+ if (IS_ERR(upperdentry))
2882+ goto out_put_dir;
2883+
2884+ if (lowerdir && upperdentry &&
2885+ (S_ISLNK(upperdentry->d_inode->i_mode) ||
2886+ S_ISDIR(upperdentry->d_inode->i_mode))) {
2887+ const struct cred *old_cred;
2888+ struct cred *override_cred;
2889+
2890+ err = -ENOMEM;
2891+ override_cred = prepare_creds();
2892+ if (!override_cred)
2893+ goto out_dput_upper;
2894+
2895+ /* CAP_SYS_ADMIN needed for getxattr */
2896+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2897+ old_cred = override_creds(override_cred);
2898+
2899+ if (ovl_is_opaquedir(upperdentry)) {
2900+ oe->opaque = true;
2901+ } else if (ovl_is_whiteout(upperdentry)) {
2902+ dput(upperdentry);
2903+ upperdentry = NULL;
2904+ oe->opaque = true;
2905+ }
2906+ revert_creds(old_cred);
2907+ put_cred(override_cred);
2908+ }
2909+ }
2910+ if (lowerdir && !oe->opaque) {
2911+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
2912+ err = PTR_ERR(lowerdentry);
2913+ if (IS_ERR(lowerdentry))
2914+ goto out_dput_upper;
2915+ }
2916+
2917+ if (lowerdentry && upperdentry &&
2918+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
2919+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
2920+ dput(lowerdentry);
2921+ lowerdentry = NULL;
2922+ oe->opaque = true;
2923+ }
2924+
2925+ if (lowerdentry || upperdentry) {
2926+ struct dentry *realdentry;
2927+
2928+ realdentry = upperdentry ? upperdentry : lowerdentry;
2929+ err = -ENOMEM;
2930+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
2931+ oe);
2932+ if (!inode)
2933+ goto out_dput;
2934+ ovl_copyattr(realdentry->d_inode, inode);
2935+ }
2936+
2937+ if (upperdentry)
2938+ oe->__upperdentry = dget(upperdentry);
2939+
2940+ if (lowerdentry)
2941+ oe->lowerdentry = lowerdentry;
2942+
2943+ dentry->d_fsdata = oe;
2944+ dentry->d_op = &ovl_dentry_operations;
2945+ d_add(dentry, inode);
2946+
2947+ return 0;
2948+
2949+out_dput:
2950+ dput(lowerdentry);
2951+out_dput_upper:
2952+ dput(upperdentry);
2953+out_put_dir:
2954+ kfree(oe);
2955+out:
2956+ return err;
2957+}
2958+
2959+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
2960+ struct nameidata *nd)
2961+{
2962+ int err = ovl_do_lookup(dentry);
2963+
2964+ if (err)
2965+ return ERR_PTR(err);
2966+
2967+ return NULL;
2968+}
2969+
2970+struct file *ovl_path_open(struct path *path, int flags)
2971+{
2972+ path_get(path);
2973+ return dentry_open(path->dentry, path->mnt, flags, current_cred());
2974+}
2975+
2976+static void ovl_put_super(struct super_block *sb)
2977+{
2978+ struct ovl_fs *ufs = sb->s_fs_info;
2979+
2980+ if (!(sb->s_flags & MS_RDONLY))
2981+ mnt_drop_write(ufs->upper_mnt);
2982+
2983+ mntput(ufs->upper_mnt);
2984+ mntput(ufs->lower_mnt);
2985+
2986+ kfree(ufs->config.lowerdir);
2987+ kfree(ufs->config.upperdir);
2988+ kfree(ufs);
2989+}
2990+
2991+static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data)
2992+{
2993+ int flags = *flagsp;
2994+ struct ovl_fs *ufs = sb->s_fs_info;
2995+
2996+ /* When remounting rw or ro, we need to adjust the write access to the
2997+ * upper fs.
2998+ */
2999+ if (((flags ^ sb->s_flags) & MS_RDONLY) == 0)
3000+ /* No change to readonly status */
3001+ return 0;
3002+
3003+ if (flags & MS_RDONLY) {
3004+ mnt_drop_write(ufs->upper_mnt);
3005+ return 0;
3006+ } else
3007+ return mnt_want_write(ufs->upper_mnt);
3008+}
3009+
3010+/**
3011+ * ovl_statfs
3012+ * @sb: The overlayfs super block
3013+ * @buf: The struct kstatfs to fill in with stats
3014+ *
3015+ * Get the filesystem statistics. As writes always target the upper layer
3016+ * filesystem pass the statfs to the same filesystem.
3017+ */
3018+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
3019+{
3020+ struct dentry *root_dentry = dentry->d_sb->s_root;
3021+ struct path path;
3022+ ovl_path_upper(root_dentry, &path);
3023+
3024+ if (!path.dentry->d_sb->s_op->statfs)
3025+ return -ENOSYS;
3026+ return path.dentry->d_sb->s_op->statfs(path.dentry, buf);
3027+}
3028+
3029+/**
3030+ * ovl_show_options
3031+ *
3032+ * Prints the mount options for a given superblock.
3033+ * Returns zero; does not fail.
3034+ */
3035+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
3036+{
3037+ struct super_block *sb = dentry->d_sb;
3038+ struct ovl_fs *ufs = sb->s_fs_info;
3039+
3040+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
3041+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
3042+ return 0;
3043+}
3044+
3045+static const struct super_operations ovl_super_operations = {
3046+ .put_super = ovl_put_super,
3047+ .remount_fs = ovl_remount_fs,
3048+ .statfs = ovl_statfs,
3049+ .show_options = ovl_show_options,
3050+};
3051+
/* Tokens returned by match_token() for the overlayfs mount options. */
enum {
	Opt_lowerdir,	/* "lowerdir=<path>" */
	Opt_upperdir,	/* "upperdir=<path>" */
	Opt_err,	/* anything else */
};
3057+
3058+static const match_table_t ovl_tokens = {
3059+ {Opt_lowerdir, "lowerdir=%s"},
3060+ {Opt_upperdir, "upperdir=%s"},
3061+ {Opt_err, NULL}
3062+};
3063+
3064+static int ovl_parse_opt(char *opt, struct ovl_config *config)
3065+{
3066+ char *p;
3067+
3068+ config->upperdir = NULL;
3069+ config->lowerdir = NULL;
3070+
3071+ while ((p = strsep(&opt, ",")) != NULL) {
3072+ int token;
3073+ substring_t args[MAX_OPT_ARGS];
3074+
3075+ if (!*p)
3076+ continue;
3077+
3078+ token = match_token(p, ovl_tokens, args);
3079+ switch (token) {
3080+ case Opt_upperdir:
3081+ kfree(config->upperdir);
3082+ config->upperdir = match_strdup(&args[0]);
3083+ if (!config->upperdir)
3084+ return -ENOMEM;
3085+ break;
3086+
3087+ case Opt_lowerdir:
3088+ kfree(config->lowerdir);
3089+ config->lowerdir = match_strdup(&args[0]);
3090+ if (!config->lowerdir)
3091+ return -ENOMEM;
3092+ break;
3093+
3094+ default:
3095+ return -EINVAL;
3096+ }
3097+ }
3098+ return 0;
3099+}
3100+
3101+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
3102+{
3103+ struct path lowerpath;
3104+ struct path upperpath;
3105+ struct inode *root_inode;
3106+ struct dentry *root_dentry;
3107+ struct ovl_entry *oe;
3108+ struct ovl_fs *ufs;
3109+ int err;
3110+
3111+ err = -ENOMEM;
3112+ ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
3113+ if (!ufs)
3114+ goto out;
3115+
3116+ err = ovl_parse_opt((char *) data, &ufs->config);
3117+ if (err)
3118+ goto out_free_ufs;
3119+
3120+ err = -EINVAL;
3121+ if (!ufs->config.upperdir || !ufs->config.lowerdir) {
3122+ printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");
3123+ goto out_free_config;
3124+ }
3125+
3126+ oe = ovl_alloc_entry();
3127+ if (oe == NULL)
3128+ goto out_free_config;
3129+
3130+ err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath);
3131+ if (err)
3132+ goto out_free_oe;
3133+
3134+ err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath);
3135+ if (err)
3136+ goto out_put_upperpath;
3137+
3138+ err = -ENOTDIR;
3139+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
3140+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode))
3141+ goto out_put_lowerpath;
3142+
3143+ sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
3144+ lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
3145+
3146+ err = -EINVAL;
3147+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
3148+ printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n");
3149+ goto out_put_lowerpath;
3150+ }
3151+
3152+
3153+ ufs->upper_mnt = clone_private_mount(&upperpath);
3154+ err = PTR_ERR(ufs->upper_mnt);
3155+ if (IS_ERR(ufs->upper_mnt)) {
3156+ printk(KERN_ERR "overlayfs: failed to clone upperpath\n");
3157+ goto out_put_lowerpath;
3158+ }
3159+
3160+ ufs->lower_mnt = clone_private_mount(&lowerpath);
3161+ err = PTR_ERR(ufs->lower_mnt);
3162+ if (IS_ERR(ufs->lower_mnt)) {
3163+ printk(KERN_ERR "overlayfs: failed to clone lowerpath\n");
3164+ goto out_put_upper_mnt;
3165+ }
3166+
3167+ /*
3168+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
3169+ * will fail instead of modifying lower fs.
3170+ */
3171+ ufs->lower_mnt->mnt_flags |= MNT_READONLY;
3172+
3173+ /* If the upper fs is r/o, we mark overlayfs r/o too */
3174+ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
3175+ sb->s_flags |= MS_RDONLY;
3176+
3177+ if (!(sb->s_flags & MS_RDONLY)) {
3178+ err = mnt_want_write(ufs->upper_mnt);
3179+ if (err)
3180+ goto out_put_lower_mnt;
3181+ }
3182+
3183+ err = -ENOMEM;
3184+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
3185+ if (!root_inode)
3186+ goto out_drop_write;
3187+
3188+ root_dentry = d_make_root(root_inode);
3189+ if (!root_dentry)
3190+ goto out_drop_write;
3191+
3192+ mntput(upperpath.mnt);
3193+ mntput(lowerpath.mnt);
3194+
3195+ oe->__upperdentry = dget(upperpath.dentry);
3196+ oe->lowerdentry = lowerpath.dentry;
3197+
3198+ root_dentry->d_fsdata = oe;
3199+ root_dentry->d_op = &ovl_dentry_operations;
3200+
3201+ sb->s_op = &ovl_super_operations;
3202+ sb->s_root = root_dentry;
3203+ sb->s_fs_info = ufs;
3204+
3205+ return 0;
3206+
3207+out_drop_write:
3208+ if (!(sb->s_flags & MS_RDONLY))
3209+ mnt_drop_write(ufs->upper_mnt);
3210+out_put_lower_mnt:
3211+ mntput(ufs->lower_mnt);
3212+out_put_upper_mnt:
3213+ mntput(ufs->upper_mnt);
3214+out_put_lowerpath:
3215+ path_put(&lowerpath);
3216+out_put_upperpath:
3217+ path_put(&upperpath);
3218+out_free_oe:
3219+ kfree(oe);
3220+out_free_config:
3221+ kfree(ufs->config.lowerdir);
3222+ kfree(ufs->config.upperdir);
3223+out_free_ufs:
3224+ kfree(ufs);
3225+out:
3226+ return err;
3227+}
3228+
3229+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
3230+ const char *dev_name, void *raw_data)
3231+{
3232+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
3233+}
3234+
3235+static struct file_system_type ovl_fs_type = {
3236+ .owner = THIS_MODULE,
3237+ .name = "overlayfs",
3238+ .mount = ovl_mount,
3239+ .kill_sb = kill_anon_super,
3240+};
3241+
3242+static int __init ovl_init(void)
3243+{
3244+ return register_filesystem(&ovl_fs_type);
3245+}
3246+
3247+static void __exit ovl_exit(void)
3248+{
3249+ unregister_filesystem(&ovl_fs_type);
3250+}
3251+
3252+module_init(ovl_init);
3253+module_exit(ovl_exit);
3254diff -ruNb a//fs/splice.c b//fs/splice.c
3255--- a//fs/splice.c 2012-10-12 21:48:25.000000000 +0100
3256+++ b//fs/splice.c 2012-10-21 15:32:47.265212787 +0100
3257@@ -1305,6 +1305,7 @@
3258
3259 return ret;
3260 }
3261+EXPORT_SYMBOL(do_splice_direct);
3262
3263 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
3264 struct pipe_inode_info *opipe,
3265diff -ruNb a//include/linux/fs.h b//include/linux/fs.h
3266--- a//include/linux/fs.h 2012-10-12 21:48:25.000000000 +0100
3267+++ b//include/linux/fs.h 2012-10-21 15:35:00.152382302 +0100
3268@@ -499,6 +499,12 @@
3269 */
3270 #include <linux/quota.h>
3271
3272+/*
3273+ * Maximum number of layers of fs stack. Needs to be limited to
3274+ * prevent kernel stack overflow
3275+ */
3276+#define FILESYSTEM_MAX_STACK_DEPTH 2
3277+
3278 /**
3279 * enum positive_aop_returns - aop return codes with specific semantics
3280 *
3281@@ -1542,6 +1548,11 @@
3282
3283 /* Being remounted read-only */
3284 int s_readonly_remount;
3285+
3286+ /*
3287+ * Indicates how deep in a filesystem stack this SB is
3288+ */
3289+ int s_stack_depth;
3290 };
3291
3292 /* superblock cache pruning functions */
3293@@ -1693,6 +1704,8 @@
3294 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
3295 u64 len);
3296 int (*update_time)(struct inode *, struct timespec *, int);
3297+ struct file *(*open) (struct dentry *, struct file *,
3298+ const struct cred *);
3299 } ____cacheline_aligned;
3300
3301 struct seq_file;
3302@@ -2057,6 +2070,7 @@
3303 extern struct file *filp_open(const char *, int, umode_t);
3304 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
3305 const char *, int);
3306+extern struct file *vfs_open(struct path *, struct file *, const struct cred *);
3307 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
3308 const struct cred *);
3309 extern int filp_close(struct file *, fl_owner_t id);
3310@@ -2249,6 +2263,7 @@
3311 #endif
3312 extern int notify_change(struct dentry *, struct iattr *);
3313 extern int inode_permission(struct inode *, int);
3314+extern int inode_only_permission(struct inode *, int);
3315 extern int generic_permission(struct inode *, int);
3316
3317 static inline bool execute_ok(struct inode *inode)
3318diff -ruNb a//include/linux/mount.h b//include/linux/mount.h
3319--- a//include/linux/mount.h 2012-10-12 21:48:25.000000000 +0100
3320+++ b//include/linux/mount.h 2012-10-21 15:33:09.262261274 +0100
3321@@ -66,6 +66,9 @@
3322 extern void mnt_unpin(struct vfsmount *mnt);
3323 extern int __mnt_is_readonly(struct vfsmount *mnt);
3324
3325+struct path;
3326+extern struct vfsmount *clone_private_mount(struct path *path);
3327+
3328 struct file_system_type;
3329 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
3330 int flags, const char *name,
3331diff -ruNb a//MAINTAINERS b//MAINTAINERS
3332--- a//MAINTAINERS 2012-10-12 21:48:25.000000000 +0100
3333+++ b//MAINTAINERS 2012-10-21 15:34:04.767813670 +0100
3334@@ -5066,6 +5066,13 @@
3335 F: include/scsi/osd_*
3336 F: fs/exofs/
3337
3338+OVERLAYFS FILESYSTEM
3339+M: Miklos Szeredi <miklos@szeredi.hu>
3340+L: linux-fsdevel@vger.kernel.org
3341+S: Supported
3342+F: fs/overlayfs/*
3343+F: Documentation/filesystems/overlayfs.txt
3344+
3345 P54 WIRELESS DRIVER
3346 M: Christian Lamparter <chunkeey@googlemail.com>
3347 L: linux-wireless@vger.kernel.org
3348diff -ruNb a//security/apparmor/apparmorfs.c b//security/apparmor/apparmorfs.c
3349--- a//security/apparmor/apparmorfs.c 2012-10-12 21:48:25.000000000 +0100
3350+++ b//security/apparmor/apparmorfs.c 2012-10-21 15:35:27.442720548 +0100
3351@@ -198,9 +198,22 @@
3352 { }
3353 };
3354
3355+static struct aa_fs_entry aa_fs_entry_mount[] = {
3356+ AA_FS_FILE_STRING("mask", "mount umount"),
3357+ { }
3358+};
3359+
3360+static struct aa_fs_entry aa_fs_entry_namespaces[] = {
3361+ AA_FS_FILE_BOOLEAN("profile", 1),
3362+ AA_FS_FILE_BOOLEAN("pivot_root", 1),
3363+ { }
3364+};
3365+
3366 static struct aa_fs_entry aa_fs_entry_features[] = {
3367 AA_FS_DIR("domain", aa_fs_entry_domain),
3368 AA_FS_DIR("file", aa_fs_entry_file),
3369+ AA_FS_DIR("mount", aa_fs_entry_mount),
3370+ AA_FS_DIR("namespaces", aa_fs_entry_namespaces),
3371 AA_FS_FILE_U64("capability", VFS_CAP_FLAGS_MASK),
3372 AA_FS_DIR("rlimit", aa_fs_entry_rlimit),
3373 { }
3374diff -ruNb a//security/apparmor/audit.c b//security/apparmor/audit.c
3375--- a//security/apparmor/audit.c 2012-10-12 21:48:25.000000000 +0100
3376+++ b//security/apparmor/audit.c 2012-10-21 15:35:27.442720548 +0100
3377@@ -44,6 +44,10 @@
3378 "file_mmap",
3379 "file_mprotect",
3380
3381+ "pivotroot",
3382+ "mount",
3383+ "umount",
3384+
3385 "create",
3386 "post_create",
3387 "bind",
3388diff -ruNb a//security/apparmor/domain.c b//security/apparmor/domain.c
3389--- a//security/apparmor/domain.c 2012-10-12 21:48:25.000000000 +0100
3390+++ b//security/apparmor/domain.c 2012-10-21 15:35:27.443720414 +0100
3391@@ -242,7 +242,7 @@
3392 *
3393 * Returns: refcounted profile, or NULL on failure (MAYBE NULL)
3394 */
3395-static struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex)
3396+struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex)
3397 {
3398 struct aa_profile *new_profile = NULL;
3399 struct aa_namespace *ns = profile->ns;
3400diff -ruNb a//security/apparmor/include/apparmor.h b//security/apparmor/include/apparmor.h
3401--- a//security/apparmor/include/apparmor.h 2012-10-12 21:48:25.000000000 +0100
3402+++ b//security/apparmor/include/apparmor.h 2012-10-21 15:35:27.443720414 +0100
3403@@ -29,8 +29,9 @@
3404 #define AA_CLASS_NET 4
3405 #define AA_CLASS_RLIMITS 5
3406 #define AA_CLASS_DOMAIN 6
3407+#define AA_CLASS_MOUNT 7
3408
3409-#define AA_CLASS_LAST AA_CLASS_DOMAIN
3410+#define AA_CLASS_LAST AA_CLASS_MOUNT
3411
3412 /* Control parameters settable through module/boot flags */
3413 extern enum audit_mode aa_g_audit;
3414diff -ruNb a//security/apparmor/include/audit.h b//security/apparmor/include/audit.h
3415--- a//security/apparmor/include/audit.h 2012-10-12 21:48:25.000000000 +0100
3416+++ b//security/apparmor/include/audit.h 2012-10-21 15:35:27.443720414 +0100
3417@@ -73,6 +73,10 @@
3418 OP_FMMAP,
3419 OP_FMPROT,
3420
3421+ OP_PIVOTROOT,
3422+ OP_MOUNT,
3423+ OP_UMOUNT,
3424+
3425 OP_CREATE,
3426 OP_POST_CREATE,
3427 OP_BIND,
3428@@ -122,6 +126,13 @@
3429 unsigned long max;
3430 } rlim;
3431 struct {
3432+ const char *src_name;
3433+ const char *type;
3434+ const char *trans;
3435+ const char *data;
3436+ unsigned long flags;
3437+ } mnt;
3438+ struct {
3439 const char *target;
3440 u32 request;
3441 u32 denied;
3442diff -ruNb a//security/apparmor/include/domain.h b//security/apparmor/include/domain.h
3443--- a//security/apparmor/include/domain.h 2012-10-12 21:48:25.000000000 +0100
3444+++ b//security/apparmor/include/domain.h 2012-10-21 15:35:27.443720414 +0100
3445@@ -23,6 +23,8 @@
3446 char **table;
3447 };
3448
3449+struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex);
3450+
3451 int apparmor_bprm_set_creds(struct linux_binprm *bprm);
3452 int apparmor_bprm_secureexec(struct linux_binprm *bprm);
3453 void apparmor_bprm_committing_creds(struct linux_binprm *bprm);
3454diff -ruNb a//security/apparmor/include/mount.h b//security/apparmor/include/mount.h
3455--- a//security/apparmor/include/mount.h 1970-01-01 01:00:00.000000000 +0100
3456+++ b//security/apparmor/include/mount.h 2012-10-21 15:35:27.443720414 +0100
3457@@ -0,0 +1,54 @@
3458+/*
3459+ * AppArmor security module
3460+ *
3461+ * This file contains AppArmor mount mediation function definitions.
3462+ *
3463+ * Copyright 2012 Canonical Ltd.
3464+ *
3465+ * This program is free software; you can redistribute it and/or
3466+ * modify it under the terms of the GNU General Public License as
3467+ * published by the Free Software Foundation, version 2 of the
3468+ * License.
3469+ */
3470+
3471+#ifndef __AA_MOUNT_H
3472+#define __AA_MOUNT_H
3473+
3474+#include <linux/fs.h>
3475+#include <linux/path.h>
3476+
3477+#include "domain.h"
3478+#include "policy.h"
3479+
3480+/* mount perms */
3481+#define AA_MAY_PIVOTROOT 0x01
3482+#define AA_MAY_MOUNT 0x02
3483+#define AA_MAY_UMOUNT 0x04
3484+#define AA_AUDIT_DATA 0x40
3485+#define AA_CONT_MATCH 0x40
3486+
3487+#define AA_MS_IGNORE_MASK (MS_KERNMOUNT | MS_NOSEC | MS_ACTIVE | MS_BORN)
3488+
3489+int aa_remount(struct aa_profile *profile, struct path *path,
3490+ unsigned long flags, void *data);
3491+
3492+int aa_bind_mount(struct aa_profile *profile, struct path *path,
3493+ const char *old_name, unsigned long flags);
3494+
3495+
3496+int aa_mount_change_type(struct aa_profile *profile, struct path *path,
3497+ unsigned long flags);
3498+
3499+int aa_move_mount(struct aa_profile *profile, struct path *path,
3500+ const char *old_name);
3501+
3502+int aa_new_mount(struct aa_profile *profile, const char *dev_name,
3503+ struct path *path, const char *type, unsigned long flags,
3504+ void *data);
3505+
3506+int aa_umount(struct aa_profile *profile, struct vfsmount *mnt, int flags);
3507+
3508+int aa_pivotroot(struct aa_profile *profile, struct path *old_path,
3509+ struct path *new_path);
3510+
3511+#endif /* __AA_MOUNT_H */
3512diff -ruNb a//security/apparmor/lsm.c b//security/apparmor/lsm.c
3513--- a//security/apparmor/lsm.c 2012-10-12 21:48:25.000000000 +0100
3514+++ b//security/apparmor/lsm.c 2012-10-21 15:35:27.444720280 +0100
3515@@ -35,6 +35,7 @@
3516 #include "include/path.h"
3517 #include "include/policy.h"
3518 #include "include/procattr.h"
3519+#include "include/mount.h"
3520
3521 /* Flag indicating whether initialization completed */
3522 int apparmor_initialized __initdata;
3523@@ -503,6 +504,60 @@
3524 !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0);
3525 }
3526
3527+static int apparmor_sb_mount(char *dev_name, struct path *path, char *type,
3528+ unsigned long flags, void *data)
3529+{
3530+ struct aa_profile *profile;
3531+ int error = 0;
3532+
3533+ /* Discard magic */
3534+ if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3535+ flags &= ~MS_MGC_MSK;
3536+
3537+ flags &= ~AA_MS_IGNORE_MASK;
3538+
3539+ profile = __aa_current_profile();
3540+ if (!unconfined(profile)) {
3541+ if (flags & MS_REMOUNT)
3542+ error = aa_remount(profile, path, flags, data);
3543+ else if (flags & MS_BIND)
3544+ error = aa_bind_mount(profile, path, dev_name, flags);
3545+ else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE |
3546+ MS_UNBINDABLE))
3547+ error = aa_mount_change_type(profile, path, flags);
3548+ else if (flags & MS_MOVE)
3549+ error = aa_move_mount(profile, path, dev_name);
3550+ else
3551+ error = aa_new_mount(profile, dev_name, path, type,
3552+ flags, data);
3553+ }
3554+ return error;
3555+}
3556+
3557+static int apparmor_sb_umount(struct vfsmount *mnt, int flags)
3558+{
3559+ struct aa_profile *profile;
3560+ int error = 0;
3561+
3562+ profile = __aa_current_profile();
3563+ if (!unconfined(profile))
3564+ error = aa_umount(profile, mnt, flags);
3565+
3566+ return error;
3567+}
3568+
3569+static int apparmor_sb_pivotroot(struct path *old_path, struct path *new_path)
3570+{
3571+ struct aa_profile *profile;
3572+ int error = 0;
3573+
3574+ profile = __aa_current_profile();
3575+ if (!unconfined(profile))
3576+ error = aa_pivotroot(profile, old_path, new_path);
3577+
3578+ return error;
3579+}
3580+
3581 static int apparmor_getprocattr(struct task_struct *task, char *name,
3582 char **value)
3583 {
3584@@ -622,6 +677,10 @@
3585 .capget = apparmor_capget,
3586 .capable = apparmor_capable,
3587
3588+ .sb_mount = apparmor_sb_mount,
3589+ .sb_umount = apparmor_sb_umount,
3590+ .sb_pivotroot = apparmor_sb_pivotroot,
3591+
3592 .path_link = apparmor_path_link,
3593 .path_unlink = apparmor_path_unlink,
3594 .path_symlink = apparmor_path_symlink,
3595diff -ruNb a//security/apparmor/Makefile b//security/apparmor/Makefile
3596--- a//security/apparmor/Makefile 2012-10-12 21:48:25.000000000 +0100
3597+++ b//security/apparmor/Makefile 2012-10-21 15:35:27.442720548 +0100
3598@@ -4,11 +4,10 @@
3599
3600 apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \
3601 path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \
3602- resource.o sid.o file.o
3603+ resource.o sid.o file.o mount.o
3604
3605 clean-files := capability_names.h rlim_names.h
3606
3607-
3608 # Build a lower case string table of capability names
3609 # Transforms lines from
3610 # #define CAP_DAC_OVERRIDE 1
3611diff -ruNb a//security/apparmor/mount.c b//security/apparmor/mount.c
3612--- a//security/apparmor/mount.c 1970-01-01 01:00:00.000000000 +0100
3613+++ b//security/apparmor/mount.c 2012-10-21 15:35:27.444720280 +0100
3614@@ -0,0 +1,620 @@
3615+/*
3616+ * AppArmor security module
3617+ *
3618+ * This file contains AppArmor mediation of mounts
3619+ *
3620+ * Copyright (C) 1998-2008 Novell/SUSE
3621+ * Copyright 2009-2012 Canonical Ltd.
3622+ *
3623+ * This program is free software; you can redistribute it and/or
3624+ * modify it under the terms of the GNU General Public License as
3625+ * published by the Free Software Foundation, version 2 of the
3626+ * License.
3627+ */
3628+
3629+#include <linux/fs.h>
3630+#include <linux/mount.h>
3631+#include <linux/namei.h>
3632+
3633+#include "include/apparmor.h"
3634+#include "include/audit.h"
3635+#include "include/context.h"
3636+#include "include/domain.h"
3637+#include "include/file.h"
3638+#include "include/match.h"
3639+#include "include/mount.h"
3640+#include "include/path.h"
3641+#include "include/policy.h"
3642+
3643+
3644+static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags)
3645+{
3646+ if (flags & MS_RDONLY)
3647+ audit_log_format(ab, "ro");
3648+ else
3649+ audit_log_format(ab, "rw");
3650+ if (flags & MS_NOSUID)
3651+ audit_log_format(ab, ", nosuid");
3652+ if (flags & MS_NODEV)
3653+ audit_log_format(ab, ", nodev");
3654+ if (flags & MS_NOEXEC)
3655+ audit_log_format(ab, ", noexec");
3656+ if (flags & MS_SYNCHRONOUS)
3657+ audit_log_format(ab, ", sync");
3658+ if (flags & MS_REMOUNT)
3659+ audit_log_format(ab, ", remount");
3660+ if (flags & MS_MANDLOCK)
3661+ audit_log_format(ab, ", mand");
3662+ if (flags & MS_DIRSYNC)
3663+ audit_log_format(ab, ", dirsync");
3664+ if (flags & MS_NOATIME)
3665+ audit_log_format(ab, ", noatime");
3666+ if (flags & MS_NODIRATIME)
3667+ audit_log_format(ab, ", nodiratime");
3668+ if (flags & MS_BIND)
3669+ audit_log_format(ab, flags & MS_REC ? ", rbind" : ", bind");
3670+ if (flags & MS_MOVE)
3671+ audit_log_format(ab, ", move");
3672+ if (flags & MS_SILENT)
3673+ audit_log_format(ab, ", silent");
3674+ if (flags & MS_POSIXACL)
3675+ audit_log_format(ab, ", acl");
3676+ if (flags & MS_UNBINDABLE)
3677+ audit_log_format(ab, flags & MS_REC ? ", runbindable" :
3678+ ", unbindable");
3679+ if (flags & MS_PRIVATE)
3680+ audit_log_format(ab, flags & MS_REC ? ", rprivate" :
3681+ ", private");
3682+ if (flags & MS_SLAVE)
3683+ audit_log_format(ab, flags & MS_REC ? ", rslave" :
3684+ ", slave");
3685+ if (flags & MS_SHARED)
3686+ audit_log_format(ab, flags & MS_REC ? ", rshared" :
3687+ ", shared");
3688+ if (flags & MS_RELATIME)
3689+ audit_log_format(ab, ", relatime");
3690+ if (flags & MS_I_VERSION)
3691+ audit_log_format(ab, ", iversion");
3692+ if (flags & MS_STRICTATIME)
3693+ audit_log_format(ab, ", strictatime");
3694+ if (flags & MS_NOUSER)
3695+ audit_log_format(ab, ", nouser");
3696+}
3697+
3698+/**
3699+ * audit_cb - call back for mount specific audit fields
3700+ * @ab: audit_buffer (NOT NULL)
3701+ * @va: audit struct to audit values of (NOT NULL)
3702+ */
3703+static void audit_cb(struct audit_buffer *ab, void *va)
3704+{
3705+ struct common_audit_data *sa = va;
3706+
3707+ if (sa->aad->mnt.type) {
3708+ audit_log_format(ab, " fstype=");
3709+ audit_log_untrustedstring(ab, sa->aad->mnt.type);
3710+ }
3711+ if (sa->aad->mnt.src_name) {
3712+ audit_log_format(ab, " srcname=");
3713+ audit_log_untrustedstring(ab, sa->aad->mnt.src_name);
3714+ }
3715+ if (sa->aad->mnt.trans) {
3716+ audit_log_format(ab, " trans=");
3717+ audit_log_untrustedstring(ab, sa->aad->mnt.trans);
3718+ }
3719+ if (sa->aad->mnt.flags || sa->aad->op == OP_MOUNT) {
3720+ audit_log_format(ab, " flags=\"");
3721+ audit_mnt_flags(ab, sa->aad->mnt.flags);
3722+ audit_log_format(ab, "\"");
3723+ }
3724+ if (sa->aad->mnt.data) {
3725+ audit_log_format(ab, " options=");
3726+ audit_log_untrustedstring(ab, sa->aad->mnt.data);
3727+ }
3728+}
3729+
3730+/**
3731+ * audit_mount - handle the auditing of mount operations
3732+ * @profile: the profile being enforced (NOT NULL)
3733+ * @gfp: allocation flags
3734+ * @op: operation being mediated (NOT NULL)
3735+ * @name: name of object being mediated (MAYBE NULL)
3736+ * @src_name: src_name of object being mediated (MAYBE NULL)
3737+ * @type: type of filesystem (MAYBE NULL)
3738+ * @trans: name of trans (MAYBE NULL)
3739+ * @flags: filesystem independent mount flags
3740+ * @data: filesystem mount flags
3741+ * @request: permissions requested
3742+ * @perms: the permissions computed for the request (NOT NULL)
3743+ * @info: extra information message (MAYBE NULL)
3744+ * @error: 0 if operation allowed else failure error code
3745+ *
3746+ * Returns: %0 or error on failure
3747+ */
3748+static int audit_mount(struct aa_profile *profile, gfp_t gfp, int op,
3749+ const char *name, const char *src_name,
3750+ const char *type, const char *trans,
3751+ unsigned long flags, const void *data, u32 request,
3752+ struct file_perms *perms, const char *info, int error)
3753+{
3754+ int audit_type = AUDIT_APPARMOR_AUTO;
3755+ struct common_audit_data sa;
3756+ struct apparmor_audit_data aad = { };
3757+
3758+ if (likely(!error)) {
3759+ u32 mask = perms->audit;
3760+
3761+ if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL))
3762+ mask = 0xffff;
3763+
3764+ /* mask off perms that are not being force audited */
3765+ request &= mask;
3766+
3767+ if (likely(!request))
3768+ return 0;
3769+ audit_type = AUDIT_APPARMOR_AUDIT;
3770+ } else {
3771+ /* only report permissions that were denied */
3772+ request = request & ~perms->allow;
3773+
3774+ if (request & perms->kill)
3775+ audit_type = AUDIT_APPARMOR_KILL;
3776+
3777+ /* quiet known rejects, assumes quiet and kill do not overlap */
3778+ if ((request & perms->quiet) &&
3779+ AUDIT_MODE(profile) != AUDIT_NOQUIET &&
3780+ AUDIT_MODE(profile) != AUDIT_ALL)
3781+ request &= ~perms->quiet;
3782+
3783+ if (!request)
3784+ return COMPLAIN_MODE(profile) ?
3785+ complain_error(error) : error;
3786+ }
3787+
3788+ sa.type = LSM_AUDIT_DATA_NONE;
3789+ sa.aad = &aad;
3790+ sa.aad->op = op;
3791+ sa.aad->name = name;
3792+ sa.aad->mnt.src_name = src_name;
3793+ sa.aad->mnt.type = type;
3794+ sa.aad->mnt.trans = trans;
3795+ sa.aad->mnt.flags = flags;
3796+ if (data && (perms->audit & AA_AUDIT_DATA))
3797+ sa.aad->mnt.data = data;
3798+ sa.aad->info = info;
3799+ sa.aad->error = error;
3800+
3801+ return aa_audit(audit_type, profile, gfp, &sa, audit_cb);
3802+}
3803+
3804+/**
3805+ * match_mnt_flags - Do an ordered match on mount flags
3806+ * @dfa: dfa to match against
3807+ * @state: state to start in
3808+ * @flags: mount flags to match against
3809+ *
3810+ * Mount flags are encoded as an ordered match. This is done instead of
3811+ * checking against a simple bitmask, to allow for logical operations
3812+ * on the flags.
3813+ *
3814+ * Returns: next state after flags match
3815+ */
3816+static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state,
3817+ unsigned long flags)
3818+{
3819+ unsigned int i;
3820+
3821+ for (i = 0; i <= 31 ; ++i) {
3822+ if ((1UL << i) & flags) /* 1 << 31 overflows int */
3823+ state = aa_dfa_next(dfa, state, i + 1);
3824+ }
3825+
3826+ return state;
3827+}
3828+
3829+/**
3830+ * compute_mnt_perms - compute mount permission associated with @state
3831+ * @dfa: dfa to match against (NOT NULL)
3832+ * @state: state match finished in
3833+ *
3834+ * Returns: mount permissions
3835+ */
3836+static struct file_perms compute_mnt_perms(struct aa_dfa *dfa,
3837+ unsigned int state)
3838+{
3839+ struct file_perms perms;
3840+
3841+ perms.kill = 0;
3842+ perms.allow = dfa_user_allow(dfa, state);
3843+ perms.audit = dfa_user_audit(dfa, state);
3844+ perms.quiet = dfa_user_quiet(dfa, state);
3845+ perms.xindex = dfa_user_xindex(dfa, state);
3846+
3847+ return perms;
3848+}
3849+
3850+static const char * const mnt_info_table[] = { /* index: do_match_mnt() rc */
3851+ "match succeeded",
3852+ "failed mntpnt match",
3853+ "failed srcname match",
3854+ "failed type match",
3855+ "failed flags match",
3856+ "failed data match"
3857+};
3858+
3859+/*
3860+ * Returns 0 on success else element that match failed in, this is the
3861+ * index into the mnt_info_table above
3862+ */
3863+static int do_match_mnt(struct aa_dfa *dfa, unsigned int start,
3864+ const char *mntpnt, const char *devname,
3865+ const char *type, unsigned long flags,
3866+ void *data, bool binary, struct file_perms *perms)
3867+{
3868+ unsigned int state;
3869+
3870+ state = aa_dfa_match(dfa, start, mntpnt);
3871+ state = aa_dfa_null_transition(dfa, state);
3872+ if (!state)
3873+ return 1;
3874+
3875+ if (devname)
3876+ state = aa_dfa_match(dfa, state, devname);
3877+ state = aa_dfa_null_transition(dfa, state);
3878+ if (!state)
3879+ return 2;
3880+
3881+ if (type)
3882+ state = aa_dfa_match(dfa, state, type);
3883+ state = aa_dfa_null_transition(dfa, state);
3884+ if (!state)
3885+ return 3;
3886+
3887+ state = match_mnt_flags(dfa, state, flags);
3888+ if (!state)
3889+ return 4;
3890+ *perms = compute_mnt_perms(dfa, state);
3891+ if (perms->allow & AA_MAY_MOUNT)
3892+ return 0;
3893+
3894+ /* only match data if not binary and the DFA flags data is expected */
3895+ if (data && !binary && (perms->allow & AA_CONT_MATCH)) {
3896+ state = aa_dfa_null_transition(dfa, state);
3897+ if (!state)
3898+ return 4;
3899+
3900+ state = aa_dfa_match(dfa, state, data);
3901+ if (!state)
3902+ return 5;
3903+ *perms = compute_mnt_perms(dfa, state);
3904+ if (perms->allow & AA_MAY_MOUNT)
3905+ return 0;
3906+ }
3907+
3908+ /* failed at end of flags match */
3909+ return 4;
3910+}
3911+
3912+/**
3913+ * match_mnt - handle path matching for mount
3914+ * @profile: the confining profile
3915+ * @mntpnt: string for the mntpnt (NOT NULL)
3916+ * @devname: string for the devname/src_name (MAYBE NULL)
3917+ * @type: string for the dev type (MAYBE NULL)
3918+ * @flags: mount flags to match
3919+ * @data: fs mount data (MAYBE NULL)
3920+ * @binary: whether @data is binary
3921+ * @perms: Returns: permission found by the match
3922+ * @info: Returns: information string about the match for logging
3923+ *
3924+ * Returns: 0 on success else error
3925+ */
3926+static int match_mnt(struct aa_profile *profile, const char *mntpnt,
3927+ const char *devname, const char *type,
3928+ unsigned long flags, void *data, bool binary,
3929+ struct file_perms *perms, const char **info)
3930+{
3931+ int pos;
3932+
3933+ if (!profile->policy.dfa)
3934+ return -EACCES;
3935+
3936+ pos = do_match_mnt(profile->policy.dfa,
3937+ profile->policy.start[AA_CLASS_MOUNT],
3938+ mntpnt, devname, type, flags, data, binary, perms);
3939+ if (pos) {
3940+ *info = mnt_info_table[pos];
3941+ return -EACCES;
3942+ }
3943+
3944+ return 0;
3945+}
3946+
3947+static int path_flags(struct aa_profile *profile, struct path *path)
3948+{
3949+ return profile->path_flags | /* '|' binds tighter than '?:' */
3950+ (S_ISDIR(path->dentry->d_inode->i_mode) ? PATH_IS_DIR : 0);
3951+}
3952+
3953+int aa_remount(struct aa_profile *profile, struct path *path,
3954+ unsigned long flags, void *data)
3955+{
3956+ struct file_perms perms = { };
3957+ const char *name, *info = NULL;
3958+ char *buffer = NULL;
3959+ int binary, error;
3960+
3961+ binary = path->dentry->d_sb->s_type->fs_flags & FS_BINARY_MOUNTDATA;
3962+
3963+ error = aa_path_name(path, path_flags(profile, path), &buffer, &name,
3964+ &info);
3965+ if (error)
3966+ goto audit;
3967+
3968+ error = match_mnt(profile, name, NULL, NULL, flags, data, binary,
3969+ &perms, &info);
3970+
3971+audit:
3972+ error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, NULL, NULL,
3973+ NULL, flags, data, AA_MAY_MOUNT, &perms, info,
3974+ error);
3975+ kfree(buffer);
3976+
3977+ return error;
3978+}
3979+
3980+int aa_bind_mount(struct aa_profile *profile, struct path *path,
3981+ const char *dev_name, unsigned long flags)
3982+{
3983+ struct file_perms perms = { };
3984+ char *buffer = NULL, *old_buffer = NULL;
3985+ const char *name, *old_name = NULL, *info = NULL;
3986+ struct path old_path;
3987+ int error;
3988+
3989+ if (!dev_name || !*dev_name)
3990+ return -EINVAL;
3991+
3992+ flags &= MS_REC | MS_BIND;
3993+
3994+ error = aa_path_name(path, path_flags(profile, path), &buffer, &name,
3995+ &info);
3996+ if (error)
3997+ goto audit;
3998+
3999+ error = kern_path(dev_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
4000+ if (error)
4001+ goto audit;
4002+
4003+ error = aa_path_name(&old_path, path_flags(profile, &old_path),
4004+ &old_buffer, &old_name, &info);
4005+ path_put(&old_path);
4006+ if (error)
4007+ goto audit;
4008+
4009+ error = match_mnt(profile, name, old_name, NULL, flags, NULL, 0,
4010+ &perms, &info);
4011+
4012+audit:
4013+ error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, old_name,
4014+ NULL, NULL, flags, NULL, AA_MAY_MOUNT, &perms,
4015+ info, error);
4016+ kfree(buffer);
4017+ kfree(old_buffer);
4018+
4019+ return error;
4020+}
4021+
4022+int aa_mount_change_type(struct aa_profile *profile, struct path *path,
4023+ unsigned long flags)
4024+{
4025+ struct file_perms perms = { };
4026+ char *buffer = NULL;
4027+ const char *name, *info = NULL;
4028+ int error;
4029+
4030+ /* These are the flags allowed by do_change_type() */
4031+ flags &= (MS_REC | MS_SILENT | MS_SHARED | MS_PRIVATE | MS_SLAVE |
4032+ MS_UNBINDABLE);
4033+
4034+ error = aa_path_name(path, path_flags(profile, path), &buffer, &name,
4035+ &info);
4036+ if (error)
4037+ goto audit;
4038+
4039+ error = match_mnt(profile, name, NULL, NULL, flags, NULL, 0, &perms,
4040+ &info);
4041+
4042+audit:
4043+ error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, NULL, NULL,
4044+ NULL, flags, NULL, AA_MAY_MOUNT, &perms, info,
4045+ error);
4046+ kfree(buffer);
4047+
4048+ return error;
4049+}
4050+
4051+int aa_move_mount(struct aa_profile *profile, struct path *path,
4052+ const char *orig_name)
4053+{
4054+ struct file_perms perms = { };
4055+ char *buffer = NULL, *old_buffer = NULL;
4056+ const char *name, *old_name = NULL, *info = NULL;
4057+ struct path old_path;
4058+ int error;
4059+
4060+ if (!orig_name || !*orig_name)
4061+ return -EINVAL;
4062+
4063+ error = aa_path_name(path, path_flags(profile, path), &buffer, &name,
4064+ &info);
4065+ if (error)
4066+ goto audit;
4067+
4068+ error = kern_path(orig_name, LOOKUP_FOLLOW, &old_path);
4069+ if (error)
4070+ goto audit;
4071+
4072+ error = aa_path_name(&old_path, path_flags(profile, &old_path),
4073+ &old_buffer, &old_name, &info);
4074+ path_put(&old_path);
4075+ if (error)
4076+ goto audit;
4077+
4078+ error = match_mnt(profile, name, old_name, NULL, MS_MOVE, NULL, 0,
4079+ &perms, &info);
4080+
4081+audit:
4082+ error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, old_name,
4083+ NULL, NULL, MS_MOVE, NULL, AA_MAY_MOUNT, &perms,
4084+ info, error);
4085+ kfree(buffer);
4086+ kfree(old_buffer);
4087+
4088+ return error;
4089+}
4090+
4091+int aa_new_mount(struct aa_profile *profile, const char *orig_dev_name,
4092+ struct path *path, const char *type, unsigned long flags,
4093+ void *data)
4094+{
4095+ struct file_perms perms = { };
4096+ char *buffer = NULL, *dev_buffer = NULL;
4097+ const char *name = NULL, *dev_name = NULL, *info = NULL;
4098+ int binary = 1;
4099+ int error;
4100+
4101+ dev_name = orig_dev_name;
4102+ if (type) {
4103+ int requires_dev;
4104+ struct file_system_type *fstype = get_fs_type(type);
4105+ if (!fstype)
4106+ return -ENODEV;
4107+
4108+ binary = fstype->fs_flags & FS_BINARY_MOUNTDATA;
4109+ requires_dev = fstype->fs_flags & FS_REQUIRES_DEV;
4110+ put_filesystem(fstype);
4111+
4112+ if (requires_dev) {
4113+ struct path dev_path;
4114+
4115+ if (!dev_name || !*dev_name) {
4116+ error = -ENOENT;
4117+ goto out;
4118+ }
4119+
4120+ error = kern_path(dev_name, LOOKUP_FOLLOW, &dev_path);
4121+ if (error)
4122+ goto audit;
4123+
4124+ error = aa_path_name(&dev_path,
4125+ path_flags(profile, &dev_path),
4126+ &dev_buffer, &dev_name, &info);
4127+ path_put(&dev_path);
4128+ if (error)
4129+ goto audit;
4130+ }
4131+ }
4132+
4133+ error = aa_path_name(path, path_flags(profile, path), &buffer, &name,
4134+ &info);
4135+ if (error)
4136+ goto audit;
4137+
4138+ error = match_mnt(profile, name, dev_name, type, flags, data, binary,
4139+ &perms, &info);
4140+
4141+audit:
4142+ error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, dev_name,
4143+ type, NULL, flags, data, AA_MAY_MOUNT, &perms, info,
4144+ error);
4145+ kfree(buffer);
4146+ kfree(dev_buffer);
4147+
4148+out:
4149+ return error;
4150+
4151+}
4152+
4153+int aa_umount(struct aa_profile *profile, struct vfsmount *mnt, int flags)
4154+{
4155+ struct file_perms perms = { };
4156+ char *buffer = NULL;
4157+ const char *name, *info = NULL;
4158+ int error;
4159+
4160+ struct path path = { mnt, mnt->mnt_root };
4161+ error = aa_path_name(&path, path_flags(profile, &path), &buffer, &name,
4162+ &info);
4163+ if (error)
4164+ goto audit;
4165+
4166+ if (!error && profile->policy.dfa) {
4167+ unsigned int state;
4168+ state = aa_dfa_match(profile->policy.dfa,
4169+ profile->policy.start[AA_CLASS_MOUNT],
4170+ name);
4171+ perms = compute_mnt_perms(profile->policy.dfa, state);
4172+ }
4173+
4174+ if (AA_MAY_UMOUNT & ~perms.allow)
4175+ error = -EACCES;
4176+
4177+audit:
4178+ error = audit_mount(profile, GFP_KERNEL, OP_UMOUNT, name, NULL, NULL,
4179+ NULL, 0, NULL, AA_MAY_UMOUNT, &perms, info, error);
4180+ kfree(buffer);
4181+
4182+ return error;
4183+}
4184+
4185+int aa_pivotroot(struct aa_profile *profile, struct path *old_path,
4186+ struct path *new_path)
4187+{
4188+ struct file_perms perms = { };
4189+ struct aa_profile *target = NULL;
4190+ char *old_buffer = NULL, *new_buffer = NULL;
4191+ const char *old_name, *new_name = NULL, *info = NULL;
4192+ int error;
4193+
4194+ error = aa_path_name(old_path, path_flags(profile, old_path),
4195+ &old_buffer, &old_name, &info);
4196+ if (error)
4197+ goto audit;
4198+
4199+ error = aa_path_name(new_path, path_flags(profile, new_path),
4200+ &new_buffer, &new_name, &info);
4201+ if (error)
4202+ goto audit;
4203+
4204+ if (profile->policy.dfa) {
4205+ unsigned int state;
4206+ state = aa_dfa_match(profile->policy.dfa,
4207+ profile->policy.start[AA_CLASS_MOUNT],
4208+ new_name);
4209+ state = aa_dfa_null_transition(profile->policy.dfa, state);
4210+ state = aa_dfa_match(profile->policy.dfa, state, old_name);
4211+ perms = compute_mnt_perms(profile->policy.dfa, state);
4212+ }
4213+
4214+ if (AA_MAY_PIVOTROOT & perms.allow) {
4215+ if ((perms.xindex & AA_X_TYPE_MASK) == AA_X_TABLE) {
4216+ target = x_table_lookup(profile, perms.xindex);
4217+ if (!target)
4218+ error = -ENOENT;
4219+ else
4220+ error = aa_replace_current_profile(target);
4221+ }
4222+ } else
4223+ error = -EACCES;
4224+
4225+audit:
4226+ error = audit_mount(profile, GFP_KERNEL, OP_PIVOTROOT, new_name,
4227+ old_name, NULL, target ? target->base.name : NULL,
4228+ 0, NULL, AA_MAY_PIVOTROOT, &perms, info, error);
4229+ aa_put_profile(target);
4230+ kfree(old_buffer);
4231+ kfree(new_buffer);
4232+
4233+ return error;
4234+}