From 92f43c452c5313a7914eab2b08d966a6c5007baa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2015 00:05:14 +0200 Subject: [PATCH 001/734] kbuild/mkspec: Simplify vmlinux.bz2 creation No need for the intermediary vmlinux.orig - bzip2 can keep the original files used for compression with --keep. Signed-off-by: Borislav Petkov Signed-off-by: Michal Marek --- scripts/package/mkspec | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index d9ab94b17de0bc..89f9669d4f0070 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -111,10 +111,8 @@ echo 'cp System.map $RPM_BUILD_ROOT'"/boot/System.map-$KERNELRELEASE" echo 'cp .config $RPM_BUILD_ROOT'"/boot/config-$KERNELRELEASE" echo "%ifnarch ppc64" -echo 'cp vmlinux vmlinux.orig' -echo 'bzip2 -9 vmlinux' +echo 'bzip2 -9 --keep vmlinux' echo 'mv vmlinux.bz2 $RPM_BUILD_ROOT'"/boot/vmlinux-$KERNELRELEASE.bz2" -echo 'mv vmlinux.orig vmlinux' echo "%endif" if ! $PREBUILT; then From dca0c0246fb739bccdd19ff2bfd0f02ccffdb07c Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Thu, 16 Apr 2015 16:42:46 +0300 Subject: [PATCH 002/734] deb-pkg: move setting debarch to a separate function The create_package() function tries to resolve the architecture in use for every package. Split setting the architecture into a new function, set_debarch(), called once on startup. This allows using debarch from other parts of the script as needed. v2: Follow Michal's suggestion of setting variables at top scope and also setting the fallback $debarch in the new function Signed-off-by: Riku Voipio Signed-off-by: Michal Marek --- scripts/package/builddeb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 88dbf23b697082..fccabe5fb72bdc 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -25,8 +25,13 @@ create_package() { chown -R root:root "$pdir" chmod -R go-w "$pdir" + # Create the package + dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir" + dpkg --build "$pdir" .. +} + +set_debarch() { # Attempt to find the correct Debian architecture - local forcearch="" debarch="" case "$UTS_MACHINE" in i386|ia64|alpha) debarch="$UTS_MACHINE" ;; @@ -47,6 +52,7 @@ arm*) debarch=arm$(grep -q CONFIG_AEABI=y $KCONFIG_CONFIG && echo el || true) ;; *) + debarch=$(dpkg --print-architecture) echo "" >&2 echo "** ** ** WARNING ** ** **" >&2 echo "" >&2 @@ -59,13 +65,8 @@ if [ -n "$KBUILD_DEBARCH" ] ; then debarch="$KBUILD_DEBARCH" fi - if [ -n "$debarch" ] ; then - forcearch="-DArchitecture=$debarch" - fi + forcearch="-DArchitecture=$debarch" - # Create the package - dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch:-$(dpkg --print-architecture)}" -p$pname -P"$pdir" - dpkg --build "$pdir" ..
} # Some variables and settings used throughout the script @@ -86,6 +87,9 @@ fwpackagename=linux-firmware-image-$version kernel_headers_packagename=linux-headers-$version libc_headers_packagename=linux-libc-dev dbg_packagename=$packagename-dbg +debarch= +forcearch= +set_debarch if [ "$ARCH" = "um" ] ; then packagename=user-mode-linux-$version From 64178cb62c329350fe06622cd215264d849b27b1 Mon Sep 17 00:00:00 2001 From: Andrey Skvortsov Date: Mon, 16 Mar 2015 11:20:54 +0300 Subject: [PATCH 003/734] builddeb: fix stripped module signatures if CONFIG_DEBUG_INFO and CONFIG_MODULE_SIG_ALL are set If CONFIG_MODULE_SIG_ALL is set, then the user expects that all modules are automatically signed in the resulting package, as is the case for rpm-pkg, binrpm-pkg, tar, and tar-*. For deb-pkg this is correct only if CONFIG_DEBUG_INFO is NOT set. In that case the deb package contains signed modules. But if CONFIG_DEBUG_INFO is set, builddeb creates a separate package with debug information. To do that, debug information from all modules is copied into separate files by objcopy, and the loadable kernel modules are stripped afterwards. Stripping removes the signatures previously added (during modules_install) from the loadable kernel modules. Therefore the final deb package contains unsigned modules despite CONFIG_MODULE_SIG_ALL being set. To solve this, re-sign all stripped modules if CONFIG_MODULE_SIG_ALL is set. Signed-off-by: Andrey Skvortsov Acked-by: maximilian attems Signed-off-by: Michal Marek --- scripts/package/builddeb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index fccabe5fb72bdc..222770c1b77510 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -166,6 +166,12 @@ if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then # then add a link to those $OBJCOPY --add-gnu-debuglink=$dbg_dir/usr/lib/debug/$module $tmpdir/$module done + + # resign stripped modules + MODULE_SIG_ALL="$(grep -s '^CONFIG_MODULE_SIG_ALL=y' $KCONFIG_CONFIG || true)" + if [ -n "$MODULE_SIG_ALL" ]; then + INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_sign + fi fi fi From ca2a9d2cf6cf3dd852c3926ac7e30ee774da4638 Mon Sep 17 00:00:00 2001 From: "Arnaud Patard (Rtp)" Date: Tue, 3 Feb 2015 13:16:33 +0100 Subject: [PATCH 004/734] deb-pkg: Add device tree blobs to the package When building a package with make deb-pkg (say, for arm), the dtb files are not added to the package. Given that things are still evolving on arm, it makes sense to have them along with the kernel and modules.
Signed-off-by: Arnaud Patard Reviewed-by: Ben Hutchings Acked-by: maximilian attems Signed-off-by: Michal Marek --- scripts/package/builddeb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 222770c1b77510..d30116b57e7e48 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -147,6 +147,13 @@ else cp arch/$ARCH/boot/$KBUILD_IMAGE "$tmpdir/$installed_image_path" fi +if grep -q "^CONFIG_OF=y" $KCONFIG_CONFIG ; then + # Only some architectures with OF support have this target + if grep -q dtbs_install "${srctree}/arch/$SRCARCH/Makefile"; then + $MAKE KBUILD_SRC= INSTALL_DTBS_PATH="$tmpdir/usr/lib/$packagename" dtbs_install + fi +fi + if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_install rm -f "$tmpdir/lib/modules/$version/build" From f9beafc9d8bf7febf673df9b41e13596ca669f75 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:27 -0300 Subject: [PATCH 005/734] coccinelle: pm_runtime: Insert blank line Insert a blank line in order to improve the readability of the generated patch and also make it consistent with the other .cocci files. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/pm_runtime.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/api/pm_runtime.cocci b/scripts/coccinelle/api/pm_runtime.cocci index f01789e967ec4b..b7042d074078cb 100644 --- a/scripts/coccinelle/api/pm_runtime.cocci +++ b/scripts/coccinelle/api/pm_runtime.cocci @@ -1,5 +1,5 @@ /// Make sure pm_runtime_* calls does not use unnecessary IS_ERR_VALUE -// +/// // Keywords: pm_runtime // Confidence: Medium // Copyright (C) 2013 Texas Instruments Incorporated - GPLv2. From fe8c46b632505a880c527bc9ae246e868aa3ece5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:28 -0300 Subject: [PATCH 006/734] coccinelle: returnvar: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/returnvar.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/returnvar.cocci b/scripts/coccinelle/misc/returnvar.cocci index 605955a91c4493..d8286ef5307fca 100644 --- a/scripts/coccinelle/misc/returnvar.cocci +++ b/scripts/coccinelle/misc/returnvar.cocci @@ -1,5 +1,5 @@ /// -/// Removes unneeded variable used to store return value. +/// Remove unneeded variable used to store return value. /// // Confidence: Moderate // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6. GPLv2. From dd494ac0de48ded6a7ec0525f253116fde5c7be5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:29 -0300 Subject: [PATCH 007/734] coccinelle: ifaddr: Fix the sentence Make the sentence sensible. 
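As a minimal C sketch of the pattern the ifaddr.cocci rule below is concerned with (the struct and field names here are hypothetical, not taken from any patch in this series): the address of a variable or field is never NULL, so such a test is always true and is almost certainly a typo for a test of the value itself.

/* Hypothetical example of the always-true address test that ifaddr.cocci flags. */
struct device_stats {
	int errors;
};

static int stats_present(struct device_stats *stats)
{
	if (&stats->errors)	/* always true; likely meant stats->errors != 0 */
		return 1;
	return 0;
}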
Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/ifaddr.cocci | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/coccinelle/misc/ifaddr.cocci b/scripts/coccinelle/misc/ifaddr.cocci index 8aebd1875e7526..c2663c677ac1c9 100644 --- a/scripts/coccinelle/misc/ifaddr.cocci +++ b/scripts/coccinelle/misc/ifaddr.cocci @@ -1,5 +1,4 @@ -/// the address of a variable or field is non-zero is likely always to bo -/// non-zero +/// The address of a variable or field is likely always to be non-zero. /// // Confidence: High // Copyright: (C) 2012 Julia Lawall, INRIA/LIP6. GPLv2. From ca34cba43168830dd96f8f6407282131733e6fb4 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:30 -0300 Subject: [PATCH 008/734] coccinelle: simple_open: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/simple_open.cocci | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/coccinelle/api/simple_open.cocci b/scripts/coccinelle/api/simple_open.cocci index b67e174f3d95ef..bd1a2a4ee106d0 100644 --- a/scripts/coccinelle/api/simple_open.cocci +++ b/scripts/coccinelle/api/simple_open.cocci @@ -1,5 +1,5 @@ -/// This removes an open coded simple_open() function -/// and replaces file operations references to the function +/// Remove an open coded simple_open() function +/// and replace file operations references to the function /// with simple_open() instead. /// // Confidence: High From 4341f6e5ce448dd79c3e663513213b936ba34c83 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 May 2015 08:02:34 -0300 Subject: [PATCH 009/734] scripts/coccinelle/misc/semicolon.cocci: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/semicolon.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/semicolon.cocci b/scripts/coccinelle/misc/semicolon.cocci index a47eba2edc9e1e..6740c659a2b384 100644 --- a/scripts/coccinelle/misc/semicolon.cocci +++ b/scripts/coccinelle/misc/semicolon.cocci @@ -1,5 +1,5 @@ /// -/// Removes unneeded semicolon. +/// Remove unneeded semicolon. /// // Confidence: Moderate // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6. GPLv2. From 74de120d8096f72bdf95aba7234428c798d931cd Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 May 2015 08:02:35 -0300 Subject: [PATCH 010/734] scripts/coccinelle/misc/irqf_oneshot.cocci: Fix grammar Correct form is 'always requested'.
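A short sketch of the boilerplate removed by the simple_open rule touched in PATCH 008 above (the driver and function names here are hypothetical): simple_open() in fs/libfs.c carries essentially this body, so the open-coded helper can be dropped and the file_operations reference pointed at simple_open() directly.

#include <linux/fs.h>

/* Hypothetical open-coded helper of the kind simple_open.cocci detects. */
static int my_debugfs_open(struct inode *inode, struct file *file)
{
	file->private_data = inode->i_private;
	return 0;
}

static const struct file_operations my_debugfs_fops = {
	.open = my_debugfs_open,	/* after the conversion: .open = simple_open */
};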
Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/irqf_oneshot.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/irqf_oneshot.cocci b/scripts/coccinelle/misc/irqf_oneshot.cocci index a24a754ae1d728..b17ac8b998947c 100644 --- a/scripts/coccinelle/misc/irqf_oneshot.cocci +++ b/scripts/coccinelle/misc/irqf_oneshot.cocci @@ -1,4 +1,4 @@ -/// Make sure threaded IRQs without a primary handler are always request with +/// Make sure threaded IRQs without a primary handler are always requested with /// IRQF_ONESHOT /// // From 4c8f20bb8e0ba6eecf62958bbf0502a2dc445ce6 Mon Sep 17 00:00:00 2001 From: Dmitry Kalinkin Date: Thu, 21 May 2015 19:19:13 +0800 Subject: [PATCH 011/734] coccinelle: api: add vma_pages.cocci This semantic patch replaces explicit computations of the vma page count with an explicit function call. Signed-off-by: Dmitry Kalinkin Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/vma_pages.cocci | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 scripts/coccinelle/api/vma_pages.cocci diff --git a/scripts/coccinelle/api/vma_pages.cocci b/scripts/coccinelle/api/vma_pages.cocci new file mode 100644 index 00000000000000..3e52e11ea1dc8a --- /dev/null +++ b/scripts/coccinelle/api/vma_pages.cocci @@ -0,0 +1,60 @@ +/// +/// Use vma_pages function on vma object instead of explicit computation. +/// +// Confidence: High +// Keywords: vma_pages vma +// Comment: Based on resource_size.cocci + +virtual context +virtual patch +virtual org +virtual report + +//---------------------------------------------------------- +// For context mode +//---------------------------------------------------------- + +@r_context depends on context && !patch && !org && !report@ +struct vm_area_struct *vma; +@@ + +* (vma->vm_end - vma->vm_start) >> PAGE_SHIFT + +//---------------------------------------------------------- +// For patch mode +//---------------------------------------------------------- + +@r_patch depends on !context && patch && !org && !report@ +struct vm_area_struct *vma; +@@ + +- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) ++ vma_pages(vma) + +//---------------------------------------------------------- +// For org mode +//---------------------------------------------------------- + +@r_org depends on !context && !patch && (org || report)@ +struct vm_area_struct *vma; +position p; +@@ + + (vma->vm_end@p - vma->vm_start) >> PAGE_SHIFT + +@script:python depends on report@ +p << r_org.p; +x << r_org.vma; +@@ + +msg="WARNING: Consider using vma_pages helper on %s" % (x) +coccilib.report.print_report(p[0], msg) + +@script:python depends on org@ +p << r_org.p; +x << r_org.vma; +@@ + +msg="WARNING: Consider using vma_pages helper on %s" % (x) +msg_safe=msg.replace("[","@(").replace("]",")") +coccilib.org.print_todo(p[0], msg_safe) From 9473a62f779d78bae646e7ef1a792d53ad4ac29e Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 24 May 2015 17:45:54 -0300 Subject: [PATCH 012/734] coccinelle: irqf_oneshot.cocci: Improve the generated commit log Improve the commit log of the generated patch by mentioning the commit that requires threaded IRQs without a primary handler to be requested with the IRQF_ONESHOT flag.
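The requirement in question comes from commit 1c6c69525b40 ("genirq: Reject bogus threaded irq requests"), cited in the hunk below: a threaded IRQ with no primary handler must pass IRQF_ONESHOT or the request fails with -EINVAL. A minimal sketch of a correct request (the device name and handler are hypothetical):

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* handle the interrupt in process context */
	return IRQ_HANDLED;
}

static int my_request_irq(unsigned int irq, void *dev)
{
	/* NULL primary handler: IRQF_ONESHOT is mandatory here */
	return request_threaded_irq(irq, NULL, my_thread_fn,
				    IRQF_ONESHOT, "my-device", dev);
}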
Signed-off-by: Fabio Estevam Acked-by: Valentin Rothberg Signed-off-by: Michal Marek --- scripts/coccinelle/misc/irqf_oneshot.cocci | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/coccinelle/misc/irqf_oneshot.cocci b/scripts/coccinelle/misc/irqf_oneshot.cocci index b17ac8b998947c..b421150a2effcd 100644 --- a/scripts/coccinelle/misc/irqf_oneshot.cocci +++ b/scripts/coccinelle/misc/irqf_oneshot.cocci @@ -1,5 +1,8 @@ -/// Make sure threaded IRQs without a primary handler are always requested with -/// IRQF_ONESHOT +/// Since commit 1c6c69525b40 ("genirq: Reject bogus threaded irq requests") +/// threaded IRQs without a primary handler need to be requested with +/// IRQF_ONESHOT, otherwise the request will fail. +/// +/// So pass the IRQF_ONESHOT flag in this case. /// // // Confidence: Good From f94c56f4f33dd34551af6bcc1afde5082fdf6e86 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 1 Jun 2015 22:52:20 -0300 Subject: [PATCH 013/734] coccinelle: simple_return: Add a blank line Insert a blank line in order to improve the readability of the generated patch and also make it consistent with the other .cocci files. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/simple_return.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/simple_return.cocci b/scripts/coccinelle/misc/simple_return.cocci index 47f7084b6360a0..e8b6313b116f57 100644 --- a/scripts/coccinelle/misc/simple_return.cocci +++ b/scripts/coccinelle/misc/simple_return.cocci @@ -1,6 +1,6 @@ /// Simplify a trivial if-return sequence. Possibly combine with a /// preceding function call. -// +/// // Confidence: High // Copyright: (C) 2014 Julia Lawall, INRIA/LIP6. GPLv2. // Copyright: (C) 2014 Gilles Muller, INRIA/LiP6. GPLv2. From d0fe116b4554d79125f384f7ba23722b41c3cb93 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 24 Apr 2015 10:27:40 -0700 Subject: [PATCH 014/734] gitignore: Add MIPS vmlinux.32 to the list MIPS64 kernel builds will produce a vmlinux.32 kernel image for compatibility; ignore it. Signed-off-by: Florian Fainelli Signed-off-by: Michal Marek --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4ad4a98b884b9c..34d6bad9317b36 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ Module.symvers /TAGS /linux /vmlinux +/vmlinux.32 /vmlinux-gdb.py /vmlinuz /System.map From a37161c0588c0d3ff4afb08ef83106a80bde604e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Apr 2015 14:02:41 -0700 Subject: [PATCH 015/734] Kbuild: Add ID files to .gitignore I use GNU id-utils to find code (essentially a database-backed grep), which generates an ID file to maintain its data. Add ID to the .gitignore file. Signed-off-by: Andi Kleen Signed-off-by: Michal Marek --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 34d6bad9317b36..98b91fccff45fc 100644 --- a/.gitignore +++ b/.gitignore @@ -90,6 +90,9 @@ GRTAGS GSYMS GTAGS +# id-utils files +ID + *.orig *~ \#*# From 21a59991ce0cd9a0b54b135305e3fcf880f2aaf1 Mon Sep 17 00:00:00 2001 From: Jim Davis Date: Mon, 8 Jun 2015 13:19:08 -0700 Subject: [PATCH 016/734] scripts/package/Makefile: rpmbuild is needed for rpm targets Before rpm release 4.1, in 2002, either the rpm command or the rpmbuild command could be used in the rpm-pkg or binrpm-pkg targets, and the Makefile chose the rpm command if the rpmbuild command wasn't found.
After release 4.1, however, the rpm command could no longer be used in place of the rpmbuild command. As the rpmbuild command is not installed by default, this can lead to failures with the rpm-pkg and binrpm-pkg targets: rpm --define "_builddir ." --target \ x86_64 -bb ./binkernel.spec rpm --target: unknown option scripts/package/Makefile:60: recipe for target 'binrpm-pkg' failed Change the Makefile to use rpmbuild unconditionally to avoid this. Signed-off-by: Jim Davis Signed-off-by: Michal Marek --- scripts/package/Makefile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/package/Makefile b/scripts/package/Makefile index 99ca6e76eb0a53..8b11d5adec7f58 100644 --- a/scripts/package/Makefile +++ b/scripts/package/Makefile @@ -21,10 +21,6 @@ # Note that the rpm-pkg target cannot be used with KBUILD_OUTPUT, # but the binrpm-pkg target can; for some reason O= gets ignored. -# Do we have rpmbuild, otherwise fall back to the older rpm -RPM := $(shell if [ -x "/usr/bin/rpmbuild" ]; then echo rpmbuild; \ - else echo rpm; fi) - # Remove hyphens since they have special meaning in RPM filenames KERNELPATH := kernel-$(subst -,_,$(KERNELRELEASE)) # Include only those top-level files that are needed by make, plus the GPL copy @@ -51,7 +47,7 @@ rpm-pkg rpm: FORCE rm -f $(objtree)/.scmversion $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - $(RPM) $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz + rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz rm $(KERNELPATH).tar.gz kernel.spec # binrpm-pkg @@ -62,7 +58,7 @@ binrpm-pkg: FORCE $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - $(RPM) $(RPMOPTS) --define "_builddir $(objtree)" --target \ + rpmbuild --define "_builddir $(objtree)" --target \ $(UTS_MACHINE) -bb $(objtree)/binkernel.spec rm binkernel.spec From 04dc91ce2cca5927159c689aa1f47663f8c51530 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 13 Jul 2015 12:26:44 +0200 Subject: [PATCH 017/734] regmap: Add better support for devices without readback support Currently regmap requires that a reg_read callback is supplied, otherwise a warning is emitted each time regmap_read() is called. This means a device or bus without readback support needs to supply a dummy reg_read callback. Apart from that, regmap_read() will still work fine if a cache is used. Remove the warning and let regmap_readable() return false if no reg_read callback is supplied. This means a device no longer has to supply a dummy callback if it does not support readback, and it also doesn't have to have a readable_reg callback that always returns false since this is now implicit.
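A minimal sketch of what this change enables (the register layout and cache type below are assumptions for illustration, not taken from the patch): a write-only device can register a regmap with no reg_read callback at all; regmap_read() is then satisfied from the register cache, and regmap_readable() implicitly reports false.

#include <linux/regmap.h>

static const struct regmap_config my_writeonly_config = {
	.reg_bits	= 8,
	.val_bits	= 8,
	.max_register	= 0x7f,
	/* reads come from the cache; no reg_read and no readable_reg needed */
	.cache_type	= REGCACHE_RBTREE,
};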
Signed-off-by: Lars-Peter Clausen Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 7111d04f26218b..8894b992043e04 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -93,6 +93,9 @@ bool regmap_writeable(struct regmap *map, unsigned int reg) bool regmap_readable(struct regmap *map, unsigned int reg) { + if (!map->reg_read) + return false; + if (map->max_register && reg > map->max_register) return false; @@ -2097,8 +2100,6 @@ static int _regmap_read(struct regmap *map, unsigned int reg, int ret; void *context = _regmap_map_get_context(map); - WARN_ON(!map->reg_read); - if (!map->cache_bypass) { ret = regcache_read(map, reg, val); if (ret == 0) From 671a2781ff01abf4fdc8904881fc3abd3a8279af Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 10 Jul 2015 17:19:55 -0400 Subject: [PATCH 018/734] security: add ioctl specific auditing to lsm_audit Add information about ioctl calls to the LSM audit data. Log the file path and command number. Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 7 +++++++ security/lsm_audit.c | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 1cc89e9df480cc..ffb9c9da4f39f8 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -40,6 +40,11 @@ struct lsm_network_audit { } fam; }; +struct lsm_ioctlop_audit { + struct path path; + u16 cmd; +}; + /* Auxiliary data to use in generating the audit record. */ struct common_audit_data { char type; @@ -53,6 +58,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_KMOD 8 #define LSM_AUDIT_DATA_INODE 9 #define LSM_AUDIT_DATA_DENTRY 10 +#define LSM_AUDIT_DATA_IOCTL_OP 11 union { struct path path; struct dentry *dentry; @@ -68,6 +74,7 @@ struct common_audit_data { } key_struct; #endif char *kmod_name; + struct lsm_ioctlop_audit *op; } u; /* this union contains LSM specific data */ union { diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 1d34277dc402b5..9f6c649c65e92a 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -245,6 +245,21 @@ static void dump_common_audit_data(struct audit_buffer *ab, } break; } + case LSM_AUDIT_DATA_IOCTL_OP: { + struct inode *inode; + + audit_log_d_path(ab, " path=", &a->u.op->path); + + inode = a->u.op->path.dentry->d_inode; + if (inode) { + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); + } + + audit_log_format(ab, " ioctlcmd=%hx", a->u.op->cmd); + break; + } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; From fa1aa143ac4a682c7f5fd52a3cf05f5a6fe44a0a Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 10 Jul 2015 17:19:56 -0400 Subject: [PATCH 019/734] selinux: extended permissions for ioctls Add extended permissions logic to selinux. Extended permissions provide additional permissions in 256-bit increments. Extend the generic ioctl permission check to use the extended permissions for per-command filtering. Source/target/class sets including the ioctl permission may additionally include a set of commands. Example: allowxperm <source> <target>:<class> ioctl unpriv_app_socket_cmds; auditallowxperm <source> <target>:<class> ioctl priv_gpu_cmds; Where unpriv_app_socket_cmds and priv_gpu_cmds are macros representing commonly granted sets of ioctl commands.
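For orientation, the ioctl_has_perm() hunk later in this patch derives the lookup indices from the low 16 bits of the ioctl command. A standalone sketch of that split (the sample command value is illustrative):

#include <linux/types.h>

/* An ioctl command selects a 256-bit permission set (driver) and a bit
 * within that set (xperm), e.g. cmd 0x5401 (TCGETS) -> driver 0x54,
 * xperm 0x01. */
static void xperm_index(u16 cmd, u8 *driver, u8 *xperm)
{
	*driver = cmd >> 8;	/* which 256-bit permission set */
	*xperm = cmd & 0xff;	/* which bit within that set */
}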
When ioctl commands are omitted only the permissions are checked. This feature is intended to provide finer granularity for the ioctl permission that may be too imprecise. For example, the same driver may use ioctls to provide important and benign functionality such as driver version or socket type as well as dangerous capabilities such as debugging features, read/write/execute to physical memory or access to sensitive data. Per-command filtering provides a mechanism to reduce the attack surface of the kernel, and limit applications to the subset of commands required. The format of the policy binary has been modified to include ioctl commands, and the policy version number has been incremented to POLICYDB_VERSION_XPERMS_IOCTL=30 to account for the format change. The extended permissions logic is deliberately generic to allow components to be reused e.g. netlink filters Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich Signed-off-by: Paul Moore --- security/selinux/avc.c | 415 +++++++++++++++++++++++++++- security/selinux/hooks.c | 42 ++- security/selinux/include/avc.h | 6 + security/selinux/include/security.h | 32 ++- security/selinux/ss/avtab.c | 104 ++++++- security/selinux/ss/avtab.h | 33 ++- security/selinux/ss/conditional.c | 32 ++- security/selinux/ss/conditional.h | 6 +- security/selinux/ss/policydb.c | 5 + security/selinux/ss/services.c | 213 ++++++++++++-- security/selinux/ss/services.h | 6 + 11 files changed, 834 insertions(+), 60 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 3c17dda9571d4e..2d5e1b04cd5029 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ struct avc_entry { u32 tsid; u16 tclass; struct av_decision avd; + struct avc_xperms_node *xp_node; }; struct avc_node { @@ -56,6 +58,16 @@ struct avc_node { struct rcu_head rhead; }; +struct avc_xperms_decision_node { + struct extended_perms_decision xpd; + struct list_head xpd_list; /* list of extended_perms_decision */ +}; + +struct avc_xperms_node { + struct extended_perms xp; + struct list_head xpd_head; /* list head of extended_perms_decision */ +}; + struct avc_cache { struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */ spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */ @@ -80,6 +92,9 @@ DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 }; static struct avc_cache avc_cache; static struct avc_callback_node *avc_callbacks; static struct kmem_cache *avc_node_cachep; +static struct kmem_cache *avc_xperms_data_cachep; +static struct kmem_cache *avc_xperms_decision_cachep; +static struct kmem_cache *avc_xperms_cachep; static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) { @@ -170,7 +185,17 @@ void __init avc_init(void) atomic_set(&avc_cache.lru_hint, 0); avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC, NULL); + avc_xperms_cachep = kmem_cache_create("avc_xperms_node", + sizeof(struct avc_xperms_node), + 0, SLAB_PANIC, NULL); + avc_xperms_decision_cachep = kmem_cache_create( + "avc_xperms_decision_node", + sizeof(struct avc_xperms_decision_node), + 0, SLAB_PANIC, NULL); + avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data", + sizeof(struct extended_perms_data), + 0, SLAB_PANIC, NULL); audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n"); } @@ -205,9 +230,261 @@ int avc_get_hash_stats(char *page) slots_used, AVC_CACHE_SLOTS, 
max_chain_len); } +/* + * using a linked list for extended_perms_decision lookup because the list is + * always small. i.e. less than 5, typically 1 + */ +static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, + struct avc_xperms_node *xp_node) +{ + struct avc_xperms_decision_node *xpd_node; + + list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { + if (xpd_node->xpd.driver == driver) + return &xpd_node->xpd; + } + return NULL; +} + +static inline unsigned int +avc_xperms_has_perm(struct extended_perms_decision *xpd, + u8 perm, u8 which) +{ + unsigned int rc = 0; + + if ((which == XPERMS_ALLOWED) && + (xpd->used & XPERMS_ALLOWED)) + rc = security_xperm_test(xpd->allowed->p, perm); + else if ((which == XPERMS_AUDITALLOW) && + (xpd->used & XPERMS_AUDITALLOW)) + rc = security_xperm_test(xpd->auditallow->p, perm); + else if ((which == XPERMS_DONTAUDIT) && + (xpd->used & XPERMS_DONTAUDIT)) + rc = security_xperm_test(xpd->dontaudit->p, perm); + return rc; +} + +static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, + u8 driver, u8 perm) +{ + struct extended_perms_decision *xpd; + security_xperm_set(xp_node->xp.drivers.p, driver); + xpd = avc_xperms_decision_lookup(driver, xp_node); + if (xpd && xpd->allowed) + security_xperm_set(xpd->allowed->p, perm); +} + +static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node) +{ + struct extended_perms_decision *xpd; + + xpd = &xpd_node->xpd; + if (xpd->allowed) + kmem_cache_free(avc_xperms_data_cachep, xpd->allowed); + if (xpd->auditallow) + kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow); + if (xpd->dontaudit) + kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit); + kmem_cache_free(avc_xperms_decision_cachep, xpd_node); +} + +static void avc_xperms_free(struct avc_xperms_node *xp_node) +{ + struct avc_xperms_decision_node *xpd_node, *tmp; + + if (!xp_node) + return; + + list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) { + list_del(&xpd_node->xpd_list); + avc_xperms_decision_free(xpd_node); + } + kmem_cache_free(avc_xperms_cachep, xp_node); +} + +static void avc_copy_xperms_decision(struct extended_perms_decision *dest, + struct extended_perms_decision *src) +{ + dest->driver = src->driver; + dest->used = src->used; + if (dest->used & XPERMS_ALLOWED) + memcpy(dest->allowed->p, src->allowed->p, + sizeof(src->allowed->p)); + if (dest->used & XPERMS_AUDITALLOW) + memcpy(dest->auditallow->p, src->auditallow->p, + sizeof(src->auditallow->p)); + if (dest->used & XPERMS_DONTAUDIT) + memcpy(dest->dontaudit->p, src->dontaudit->p, + sizeof(src->dontaudit->p)); +} + +/* + * similar to avc_copy_xperms_decision, but only copy decision + * information relevant to this perm + */ +static inline void avc_quick_copy_xperms_decision(u8 perm, + struct extended_perms_decision *dest, + struct extended_perms_decision *src) +{ + /* + * compute index of the u32 of the 256 bits (8 u32s) that contain this + * command permission + */ + u8 i = perm >> 5; + + dest->used = src->used; + if (dest->used & XPERMS_ALLOWED) + dest->allowed->p[i] = src->allowed->p[i]; + if (dest->used & XPERMS_AUDITALLOW) + dest->auditallow->p[i] = src->auditallow->p[i]; + if (dest->used & XPERMS_DONTAUDIT) + dest->dontaudit->p[i] = src->dontaudit->p[i]; +} + +static struct avc_xperms_decision_node + *avc_xperms_decision_alloc(u8 which) +{ + struct avc_xperms_decision_node *xpd_node; + struct extended_perms_decision *xpd; + + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, + GFP_ATOMIC | 
__GFP_NOMEMALLOC); + if (!xpd_node) + return NULL; + + xpd = &xpd_node->xpd; + if (which & XPERMS_ALLOWED) { + xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->allowed) + goto error; + } + if (which & XPERMS_AUDITALLOW) { + xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->auditallow) + goto error; + } + if (which & XPERMS_DONTAUDIT) { + xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->dontaudit) + goto error; + } + return xpd_node; +error: + avc_xperms_decision_free(xpd_node); + return NULL; +} + +static int avc_add_xperms_decision(struct avc_node *node, + struct extended_perms_decision *src) +{ + struct avc_xperms_decision_node *dest_xpd; + + node->ae.xp_node->xp.len++; + dest_xpd = avc_xperms_decision_alloc(src->used); + if (!dest_xpd) + return -ENOMEM; + avc_copy_xperms_decision(&dest_xpd->xpd, src); + list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); + return 0; +} + +static struct avc_xperms_node *avc_xperms_alloc(void) +{ + struct avc_xperms_node *xp_node; + + xp_node = kmem_cache_zalloc(avc_xperms_cachep, + GFP_ATOMIC|__GFP_NOMEMALLOC); + if (!xp_node) + return xp_node; + INIT_LIST_HEAD(&xp_node->xpd_head); + return xp_node; +} + +static int avc_xperms_populate(struct avc_node *node, + struct avc_xperms_node *src) +{ + struct avc_xperms_node *dest; + struct avc_xperms_decision_node *dest_xpd; + struct avc_xperms_decision_node *src_xpd; + + if (src->xp.len == 0) + return 0; + dest = avc_xperms_alloc(); + if (!dest) + return -ENOMEM; + + memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); + dest->xp.len = src->xp.len; + + /* for each source xpd allocate a destination xpd and copy */ + list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { + dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used); + if (!dest_xpd) + goto error; + avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd); + list_add(&dest_xpd->xpd_list, &dest->xpd_head); + } + node->ae.xp_node = dest; + return 0; +error: + avc_xperms_free(dest); + return -ENOMEM; + +} + +static inline u32 avc_xperms_audit_required(u32 requested, + struct av_decision *avd, + struct extended_perms_decision *xpd, + u8 perm, + int result, + u32 *deniedp) +{ + u32 denied, audited; + + denied = requested & ~avd->allowed; + if (unlikely(denied)) { + audited = denied & avd->auditdeny; + if (audited && xpd) { + if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT)) + audited &= ~requested; + } + } else if (result) { + audited = denied = requested; + } else { + audited = requested & avd->auditallow; + if (audited && xpd) { + if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW)) + audited &= ~requested; + } + } + + *deniedp = denied; + return audited; +} + +static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass, + u32 requested, struct av_decision *avd, + struct extended_perms_decision *xpd, + u8 perm, int result, + struct common_audit_data *ad) +{ + u32 audited, denied; + + audited = avc_xperms_audit_required( + requested, avd, xpd, perm, result, &denied); + if (likely(!audited)) + return 0; + return slow_avc_audit(ssid, tsid, tclass, requested, + audited, denied, result, ad, 0); +} + static void avc_node_free(struct rcu_head *rhead) { struct avc_node *node = container_of(rhead, struct avc_node, rhead); + avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); } @@ -221,6 +498,7 @@ static 
void avc_node_delete(struct avc_node *node) static void avc_node_kill(struct avc_node *node) { + avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); atomic_dec(&avc_cache.active_nodes); @@ -367,6 +645,7 @@ static int avc_latest_notif_update(int seqno, int is_insert) * @tsid: target security identifier * @tclass: target security class * @avd: resulting av decision + * @xp_node: resulting extended permissions * * Insert an AVC entry for the SID pair * (@ssid, @tsid) and class @tclass. @@ -378,7 +657,9 @@ static int avc_latest_notif_update(int seqno, int is_insert) * the access vectors into a cache entry, returns * avc_node inserted. Otherwise, this function returns NULL. */ -static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) +static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, + struct av_decision *avd, + struct avc_xperms_node *xp_node) { struct avc_node *pos, *node = NULL; int hvalue; @@ -391,10 +672,15 @@ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_dec if (node) { struct hlist_head *head; spinlock_t *lock; + int rc = 0; hvalue = avc_hash(ssid, tsid, tclass); avc_node_populate(node, ssid, tsid, tclass, avd); - + rc = avc_xperms_populate(node, xp_node); + if (rc) { + kmem_cache_free(avc_node_cachep, node); + return NULL; + } head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; @@ -523,14 +809,17 @@ int __init avc_add_callback(int (*callback)(u32 event), u32 events) * @perms : Permission mask bits * @ssid,@tsid,@tclass : identifier of an AVC entry * @seqno : sequence number when decision was made + * @xpd: extended_perms_decision to be added to the node * * if a valid AVC entry doesn't exist,this function returns -ENOENT. * if kmalloc() called internal returns NULL, this function returns -ENOMEM. * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ -static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, - u32 seqno) +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, + u32 tsid, u16 tclass, u32 seqno, + struct extended_perms_decision *xpd, + u32 flags) { int hvalue, rc = 0; unsigned long flag; @@ -574,9 +863,19 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd); + if (orig->ae.xp_node) { + rc = avc_xperms_populate(node, orig->ae.xp_node); + if (rc) { + kmem_cache_free(avc_node_cachep, node); + goto out_unlock; + } + } + switch (event) { case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; + if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) + avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: @@ -594,6 +893,9 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, case AVC_CALLBACK_AUDITDENY_DISABLE: node->ae.avd.auditdeny &= ~perms; break; + case AVC_CALLBACK_ADD_XPERMS: + avc_add_xperms_decision(node, xpd); + break; } avc_node_replace(node, orig); out_unlock: @@ -665,18 +967,20 @@ int avc_ss_reset(u32 seqno) * results in a bigger stack frame. 
*/ static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid, - u16 tclass, struct av_decision *avd) + u16 tclass, struct av_decision *avd, + struct avc_xperms_node *xp_node) { rcu_read_unlock(); - security_compute_av(ssid, tsid, tclass, avd); + INIT_LIST_HEAD(&xp_node->xpd_head); + security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp); rcu_read_lock(); - return avc_insert(ssid, tsid, tclass, avd); + return avc_insert(ssid, tsid, tclass, avd, xp_node); } static noinline int avc_denied(u32 ssid, u32 tsid, - u16 tclass, u32 requested, - unsigned flags, - struct av_decision *avd) + u16 tclass, u32 requested, + u8 driver, u8 xperm, unsigned flags, + struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; @@ -684,11 +988,91 @@ static noinline int avc_denied(u32 ssid, u32 tsid, if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(AVC_CALLBACK_GRANT, requested, ssid, - tsid, tclass, avd->seqno); + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid, + tsid, tclass, avd->seqno, NULL, flags); return 0; } +/* + * The avc extended permissions logic adds an additional 256 bits of + * permissions to an avc node when extended permissions for that node are + * specified in the avtab. If the additional 256 permissions is not adequate, + * as-is the case with ioctls, then multiple may be chained together and the + * driver field is used to specify which set contains the permission. + */ +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 xperm, struct common_audit_data *ad) +{ + struct avc_node *node; + struct av_decision avd; + u32 denied; + struct extended_perms_decision local_xpd; + struct extended_perms_decision *xpd = NULL; + struct extended_perms_data allowed; + struct extended_perms_data auditallow; + struct extended_perms_data dontaudit; + struct avc_xperms_node local_xp_node; + struct avc_xperms_node *xp_node; + int rc = 0, rc2; + + xp_node = &local_xp_node; + BUG_ON(!requested); + + rcu_read_lock(); + + node = avc_lookup(ssid, tsid, tclass); + if (unlikely(!node)) { + node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node); + } else { + memcpy(&avd, &node->ae.avd, sizeof(avd)); + xp_node = node->ae.xp_node; + } + /* if extended permissions are not defined, only consider av_decision */ + if (!xp_node || !xp_node->xp.len) + goto decision; + + local_xpd.allowed = &allowed; + local_xpd.auditallow = &auditallow; + local_xpd.dontaudit = &dontaudit; + + xpd = avc_xperms_decision_lookup(driver, xp_node); + if (unlikely(!xpd)) { + /* + * Compute the extended_perms_decision only if the driver + * is flagged + */ + if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { + avd.allowed &= ~requested; + goto decision; + } + rcu_read_unlock(); + security_compute_xperms_decision(ssid, tsid, tclass, driver, + &local_xpd); + rcu_read_lock(); + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm, + ssid, tsid, tclass, avd.seqno, &local_xpd, 0); + } else { + avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); + } + xpd = &local_xpd; + + if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED)) + avd.allowed &= ~requested; + +decision: + denied = requested & ~(avd.allowed); + if (unlikely(denied)) + rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm, + AVC_EXTENDED_PERMS, &avd); + + rcu_read_unlock(); + + rc2 = avc_xperms_audit(ssid, tsid, tclass, requested, + &avd, xpd, xperm, rc, ad); + if (rc2) + return rc2; + return rc; +} /** * avc_has_perm_noaudit - Check 
permissions but perform no auditing. @@ -716,6 +1100,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, struct av_decision *avd) { struct avc_node *node; + struct avc_xperms_node xp_node; int rc = 0; u32 denied; @@ -725,13 +1110,13 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, node = avc_lookup(ssid, tsid, tclass); if (unlikely(!node)) - node = avc_compute_av(ssid, tsid, tclass, avd); + node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node); else memcpy(avd, &node->ae.avd, sizeof(*avd)); denied = requested & ~(avd->allowed); if (unlikely(denied)) - rc = avc_denied(ssid, tsid, tclass, requested, flags, avd); + rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd); rcu_read_unlock(); return rc; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 692e3cc8ce2393..a049b72162707a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3216,6 +3216,46 @@ static void selinux_file_free_security(struct file *file) file_free_security(file); } +/* + * Check whether a task has the ioctl permission and cmd + * operation to an inode. + */ +int ioctl_has_perm(const struct cred *cred, struct file *file, + u32 requested, u16 cmd) +{ + struct common_audit_data ad; + struct file_security_struct *fsec = file->f_security; + struct inode *inode = file_inode(file); + struct inode_security_struct *isec = inode->i_security; + struct lsm_ioctlop_audit ioctl; + u32 ssid = cred_sid(cred); + int rc; + u8 driver = cmd >> 8; + u8 xperm = cmd & 0xff; + + ad.type = LSM_AUDIT_DATA_IOCTL_OP; + ad.u.op = &ioctl; + ad.u.op->cmd = cmd; + ad.u.op->path = file->f_path; + + if (ssid != fsec->sid) { + rc = avc_has_perm(ssid, fsec->sid, + SECCLASS_FD, + FD__USE, + &ad); + if (rc) + goto out; + } + + if (unlikely(IS_PRIVATE(inode))) + return 0; + + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, + requested, driver, xperm, &ad); +out: + return rc; +} + static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3258,7 +3298,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, * to the file's ioctl() function. */ default: - error = file_has_perm(cred, file, FILE__IOCTL); + error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index ddf8eec03f2117..db12ff14277b24 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -142,6 +142,7 @@ static inline int avc_audit(u32 ssid, u32 tsid, } #define AVC_STRICT 1 /* Ignore permissive mode. 
*/ +#define AVC_EXTENDED_PERMS 2 /* update extended permissions */ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned flags, @@ -151,6 +152,10 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 perm, struct common_audit_data *ad); + + u32 avc_policy_seqno(void); #define AVC_CALLBACK_GRANT 1 @@ -161,6 +166,7 @@ u32 avc_policy_seqno(void); #define AVC_CALLBACK_AUDITALLOW_DISABLE 32 #define AVC_CALLBACK_AUDITDENY_ENABLE 64 #define AVC_CALLBACK_AUDITDENY_DISABLE 128 +#define AVC_CALLBACK_ADD_XPERMS 256 int avc_add_callback(int (*callback)(u32 event), u32 events); diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 36993ad1c067a8..6a681d26bf20a6 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -35,13 +35,14 @@ #define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27 #define POLICYDB_VERSION_DEFAULT_TYPE 28 #define POLICYDB_VERSION_CONSTRAINT_NAMES 29 +#define POLICYDB_VERSION_XPERMS_IOCTL 30 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_CONSTRAINT_NAMES +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_XPERMS_IOCTL #endif /* Mask for just the mount related flags */ @@ -109,11 +110,38 @@ struct av_decision { u32 flags; }; +#define XPERMS_ALLOWED 1 +#define XPERMS_AUDITALLOW 2 +#define XPERMS_DONTAUDIT 4 + +#define security_xperm_set(perms, x) (perms[x >> 5] |= 1 << (x & 0x1f)) +#define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f))) +struct extended_perms_data { + u32 p[8]; +}; + +struct extended_perms_decision { + u8 used; + u8 driver; + struct extended_perms_data *allowed; + struct extended_perms_data *auditallow; + struct extended_perms_data *dontaudit; +}; + +struct extended_perms { + u16 len; /* length associated decision chain */ + struct extended_perms_data drivers; /* flag drivers that are used */ +}; + /* definitions of av_decision.flags */ #define AVD_FLAGS_PERMISSIVE 0x0001 void security_compute_av(u32 ssid, u32 tsid, - u16 tclass, struct av_decision *avd); + u16 tclass, struct av_decision *avd, + struct extended_perms *xperms); + +void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, + u8 driver, struct extended_perms_decision *xpermd); void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd); diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index b64f2772b03019..3628d3a868b669 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -24,6 +24,7 @@ #include "policydb.h" static struct kmem_cache *avtab_node_cachep; +static struct kmem_cache *avtab_xperms_cachep; /* Based on MurmurHash3, written by Austin Appleby and placed in the * public domain. 
@@ -70,11 +71,24 @@ avtab_insert_node(struct avtab *h, int hvalue, struct avtab_key *key, struct avtab_datum *datum) { struct avtab_node *newnode; + struct avtab_extended_perms *xperms; newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; newnode->key = *key; - newnode->datum = *datum; + + if (key->specified & AVTAB_XPERMS) { + xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL); + if (xperms == NULL) { + kmem_cache_free(avtab_node_cachep, newnode); + return NULL; + } + *xperms = *(datum->u.xperms); + newnode->datum.u.xperms = xperms; + } else { + newnode->datum.u.data = datum->u.data; + } + if (prev) { newnode->next = prev->next; prev->next = newnode; @@ -107,8 +121,12 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && - (specified & cur->key.specified)) + (specified & cur->key.specified)) { + /* extended perms may not be unique */ + if (specified & AVTAB_XPERMS) + break; return -EEXIST; + } if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && @@ -271,6 +289,9 @@ void avtab_destroy(struct avtab *h) while (cur) { temp = cur; cur = cur->next; + if (temp->key.specified & AVTAB_XPERMS) + kmem_cache_free(avtab_xperms_cachep, + temp->datum.u.xperms); kmem_cache_free(avtab_node_cachep, temp); } } @@ -359,7 +380,10 @@ static uint16_t spec_order[] = { AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, - AVTAB_MEMBER + AVTAB_MEMBER, + AVTAB_XPERMS_ALLOWED, + AVTAB_XPERMS_AUDITALLOW, + AVTAB_XPERMS_DONTAUDIT }; int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, @@ -369,10 +393,11 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, { __le16 buf16[4]; u16 enabled; - __le32 buf32[7]; u32 items, items2, val, vers = pol->policyvers; struct avtab_key key; struct avtab_datum datum; + struct avtab_extended_perms xperms; + __le32 buf32[ARRAY_SIZE(xperms.perms.p)]; int i, rc; unsigned set; @@ -429,11 +454,15 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, printk(KERN_ERR "SELinux: avtab: entry has both access vectors and types\n"); return -EINVAL; } + if (val & AVTAB_XPERMS) { + printk(KERN_ERR "SELinux: avtab: entry has extended permissions\n"); + return -EINVAL; + } for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (val & spec_order[i]) { key.specified = spec_order[i] | enabled; - datum.data = le32_to_cpu(buf32[items++]); + datum.u.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if (rc) return rc; @@ -476,14 +505,42 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, return -EINVAL; } - rc = next_entry(buf32, fp, sizeof(u32)); - if (rc) { - printk(KERN_ERR "SELinux: avtab: truncated entry\n"); - return rc; + if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) && + (key.specified & AVTAB_XPERMS)) { + printk(KERN_ERR "SELinux: avtab: policy version %u does not " + "support extended permissions rules and one " + "was specified\n", vers); + return -EINVAL; + } else if (key.specified & AVTAB_XPERMS) { + memset(&xperms, 0, sizeof(struct avtab_extended_perms)); + rc = next_entry(&xperms.specified, fp, sizeof(u8)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + rc = next_entry(&xperms.driver, fp, sizeof(u8)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + rc = next_entry(buf32, fp, 
sizeof(u32)*ARRAY_SIZE(xperms.perms.p)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++) + xperms.perms.p[i] = le32_to_cpu(buf32[i]); + datum.u.xperms = &xperms; + } else { + rc = next_entry(buf32, fp, sizeof(u32)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + datum.u.data = le32_to_cpu(*buf32); } - datum.data = le32_to_cpu(*buf32); if ((key.specified & AVTAB_TYPE) && - !policydb_type_isvalid(pol, datum.data)) { + !policydb_type_isvalid(pol, datum.u.data)) { printk(KERN_ERR "SELinux: avtab: invalid type\n"); return -EINVAL; } @@ -543,8 +600,9 @@ int avtab_read(struct avtab *a, void *fp, struct policydb *pol) int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) { __le16 buf16[4]; - __le32 buf32[1]; + __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)]; int rc; + unsigned int i; buf16[0] = cpu_to_le16(cur->key.source_type); buf16[1] = cpu_to_le16(cur->key.target_type); @@ -553,8 +611,22 @@ int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) rc = put_entry(buf16, sizeof(u16), 4, fp); if (rc) return rc; - buf32[0] = cpu_to_le32(cur->datum.data); - rc = put_entry(buf32, sizeof(u32), 1, fp); + + if (cur->key.specified & AVTAB_XPERMS) { + rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp); + if (rc) + return rc; + rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp); + if (rc) + return rc; + for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++) + buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]); + rc = put_entry(buf32, sizeof(u32), + ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp); + } else { + buf32[0] = cpu_to_le32(cur->datum.u.data); + rc = put_entry(buf32, sizeof(u32), 1, fp); + } if (rc) return rc; return 0; @@ -588,9 +660,13 @@ void avtab_cache_init(void) avtab_node_cachep = kmem_cache_create("avtab_node", sizeof(struct avtab_node), 0, SLAB_PANIC, NULL); + avtab_xperms_cachep = kmem_cache_create("avtab_extended_perms", + sizeof(struct avtab_extended_perms), + 0, SLAB_PANIC, NULL); } void avtab_cache_destroy(void) { kmem_cache_destroy(avtab_node_cachep); + kmem_cache_destroy(avtab_xperms_cachep); } diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index adb451cd44f9d3..d946c9dc3c9ca6 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -23,6 +23,7 @@ #ifndef _SS_AVTAB_H_ #define _SS_AVTAB_H_ +#include "security.h" #include struct avtab_key { @@ -37,13 +38,43 @@ struct avtab_key { #define AVTAB_MEMBER 0x0020 #define AVTAB_CHANGE 0x0040 #define AVTAB_TYPE (AVTAB_TRANSITION | AVTAB_MEMBER | AVTAB_CHANGE) +/* extended permissions */ +#define AVTAB_XPERMS_ALLOWED 0x0100 +#define AVTAB_XPERMS_AUDITALLOW 0x0200 +#define AVTAB_XPERMS_DONTAUDIT 0x0400 +#define AVTAB_XPERMS (AVTAB_XPERMS_ALLOWED | \ + AVTAB_XPERMS_AUDITALLOW | \ + AVTAB_XPERMS_DONTAUDIT) #define AVTAB_ENABLED_OLD 0x80000000 /* reserved for used in cond_avtab */ #define AVTAB_ENABLED 0x8000 /* reserved for used in cond_avtab */ u16 specified; /* what field is specified */ }; +/* + * For operations that require more than the 32 permissions provided by the avc + * extended permissions may be used to provide 256 bits of permissions. + */ +struct avtab_extended_perms { +/* These are not flags. 
All 256 values may be used */ +#define AVTAB_XPERMS_IOCTLFUNCTION 0x01 +#define AVTAB_XPERMS_IOCTLDRIVER 0x02 + /* extension of the avtab_key specified */ + u8 specified; /* ioctl, netfilter, ... */ + /* + * if 256 bits is not adequate as is often the case with ioctls, then + * multiple extended perms may be used and the driver field + * specifies which permissions are included. + */ + u8 driver; + /* 256 bits of permissions */ + struct extended_perms_data perms; +}; + struct avtab_datum { - u32 data; /* access vector or type value */ + union { + u32 data; /* access vector or type value */ + struct avtab_extended_perms *xperms; + } u; }; struct avtab_node { diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index 62c6773be0b75f..18643bf9894d5e 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -15,6 +15,7 @@ #include "security.h" #include "conditional.h" +#include "services.h" /* * cond_evaluate_expr evaluates a conditional expr @@ -612,21 +613,39 @@ int cond_write_list(struct policydb *p, struct cond_node *list, void *fp) return 0; } + +void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, + struct extended_perms_decision *xpermd) +{ + struct avtab_node *node; + + if (!ctab || !key || !xpermd) + return; + + for (node = avtab_search_node(ctab, key); node; + node = avtab_search_node_next(node, key->specified)) { + if (node->key.specified & AVTAB_ENABLED) + services_compute_xperms_decision(xpermd, node); + } + return; + +} /* Determine whether additional permissions are granted by the conditional * av table, and if so, add them to the result */ -void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd) +void cond_compute_av(struct avtab *ctab, struct avtab_key *key, + struct av_decision *avd, struct extended_perms *xperms) { struct avtab_node *node; - if (!ctab || !key || !avd) + if (!ctab || !key || !avd || !xperms) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED))) - avd->allowed |= node->datum.data; + avd->allowed |= node->datum.u.data; if ((u16)(AVTAB_AUDITDENY|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITDENY|AVTAB_ENABLED))) /* Since a '0' in an auditdeny mask represents a @@ -634,10 +653,13 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi * the '&' operand to ensure that all '0's in the mask * are retained (much unlike the allow and auditallow cases). 
*/ - avd->auditdeny &= node->datum.data; + avd->auditdeny &= node->datum.u.data; if ((u16)(AVTAB_AUDITALLOW|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITALLOW|AVTAB_ENABLED))) - avd->auditallow |= node->datum.data; + avd->auditallow |= node->datum.u.data; + if ((node->key.specified & AVTAB_ENABLED) && + (node->key.specified & AVTAB_XPERMS)) + services_compute_xperms_drivers(xperms, node); } return; } diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 4d1f87466508f7..ddb43e7e1c756d 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -73,8 +73,10 @@ int cond_read_list(struct policydb *p, void *fp); int cond_write_bool(void *key, void *datum, void *ptr); int cond_write_list(struct policydb *p, struct cond_node *list, void *fp); -void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); - +void cond_compute_av(struct avtab *ctab, struct avtab_key *key, + struct av_decision *avd, struct extended_perms *xperms); +void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, + struct extended_perms_decision *xpermd); int evaluate_cond_node(struct policydb *p, struct cond_node *node); #endif /* _CONDITIONAL_H_ */ diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 74aa224267c11f..992a3153082587 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -148,6 +148,11 @@ static struct policydb_compat_info policydb_compat[] = { .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, + { + .version = POLICYDB_VERSION_XPERMS_IOCTL, + .sym_num = SYM_NUM, + .ocon_num = OCON_NUM, + }, }; static struct policydb_compat_info *policydb_lookup_compat(int version) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 9e2d8207091536..b7df12ba61d839 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -93,9 +93,10 @@ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len); static void context_struct_compute_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - struct av_decision *avd); + struct context *tcontext, + u16 tclass, + struct av_decision *avd, + struct extended_perms *xperms); struct selinux_mapping { u16 value; /* policy value */ @@ -565,7 +566,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(&lo_scontext, tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -580,7 +582,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(scontext, &lo_tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -596,7 +599,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(&lo_scontext, &lo_tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -613,13 +617,39 @@ static void type_attribute_bounds_av(struct context *scontext, } /* - * Compute access vectors based on a context structure pair for - * the permissions in a particular class. 
+ * flag which drivers have permissions + * only looking for ioctl based extended permssions + */ +void services_compute_xperms_drivers( + struct extended_perms *xperms, + struct avtab_node *node) +{ + unsigned int i; + + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + /* if one or more driver has all permissions allowed */ + for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) + xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; + } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + /* if allowing permissions within a driver */ + security_xperm_set(xperms->drivers.p, + node->datum.u.xperms->driver); + } + + /* If no ioctl commands are allowed, ignore auditallow and auditdeny */ + if (node->key.specified & AVTAB_XPERMS_ALLOWED) + xperms->len = 1; +} + +/* + * Compute access vectors and extended permissions based on a context + * structure pair for the permissions in a particular class. */ static void context_struct_compute_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - struct av_decision *avd) + struct context *tcontext, + u16 tclass, + struct av_decision *avd, + struct extended_perms *xperms) { struct constraint_node *constraint; struct role_allow *ra; @@ -633,6 +663,10 @@ static void context_struct_compute_av(struct context *scontext, avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; + if (xperms) { + memset(&xperms->drivers, 0, sizeof(xperms->drivers)); + xperms->len = 0; + } if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) { if (printk_ratelimit()) @@ -647,7 +681,7 @@ static void context_struct_compute_av(struct context *scontext, * this permission check, then use it. */ avkey.target_class = tclass; - avkey.specified = AVTAB_AV; + avkey.specified = AVTAB_AV | AVTAB_XPERMS; sattr = flex_array_get(policydb.type_attr_map_array, scontext->type - 1); BUG_ON(!sattr); tattr = flex_array_get(policydb.type_attr_map_array, tcontext->type - 1); @@ -660,15 +694,18 @@ static void context_struct_compute_av(struct context *scontext, node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) - avd->allowed |= node->datum.data; + avd->allowed |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITALLOW) - avd->auditallow |= node->datum.data; + avd->auditallow |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITDENY) - avd->auditdeny &= node->datum.data; + avd->auditdeny &= node->datum.u.data; + else if (xperms && (node->key.specified & AVTAB_XPERMS)) + services_compute_xperms_drivers(xperms, node); } /* Check conditional av table for additional permissions */ - cond_compute_av(&policydb.te_cond_avtab, &avkey, avd); + cond_compute_av(&policydb.te_cond_avtab, &avkey, + avd, xperms); } } @@ -899,6 +936,139 @@ static void avd_init(struct av_decision *avd) avd->flags = 0; } +void services_compute_xperms_decision(struct extended_perms_decision *xpermd, + struct avtab_node *node) +{ + unsigned int i; + + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + if (xpermd->driver != node->datum.u.xperms->driver) + return; + } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + if (!security_xperm_test(node->datum.u.xperms->perms.p, + xpermd->driver)) + return; + } else { + BUG(); + } + + if (node->key.specified == AVTAB_XPERMS_ALLOWED) { + xpermd->used |= XPERMS_ALLOWED; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->allowed->p, 0xff, + 
sizeof(xpermd->allowed->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++) + xpermd->allowed->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) { + xpermd->used |= XPERMS_AUDITALLOW; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->auditallow->p, 0xff, + sizeof(xpermd->auditallow->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++) + xpermd->auditallow->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) { + xpermd->used |= XPERMS_DONTAUDIT; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->dontaudit->p, 0xff, + sizeof(xpermd->dontaudit->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++) + xpermd->dontaudit->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else { + BUG(); + } +} + +void security_compute_xperms_decision(u32 ssid, + u32 tsid, + u16 orig_tclass, + u8 driver, + struct extended_perms_decision *xpermd) +{ + u16 tclass; + struct context *scontext, *tcontext; + struct avtab_key avkey; + struct avtab_node *node; + struct ebitmap *sattr, *tattr; + struct ebitmap_node *snode, *tnode; + unsigned int i, j; + + xpermd->driver = driver; + xpermd->used = 0; + memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); + memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); + memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); + + read_lock(&policy_rwlock); + if (!ss_initialized) + goto allow; + + scontext = sidtab_search(&sidtab, ssid); + if (!scontext) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n", + __func__, ssid); + goto out; + } + + tcontext = sidtab_search(&sidtab, tsid); + if (!tcontext) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n", + __func__, tsid); + goto out; + } + + tclass = unmap_class(orig_tclass); + if (unlikely(orig_tclass && !tclass)) { + if (policydb.allow_unknown) + goto allow; + goto out; + } + + + if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) { + pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass); + goto out; + } + + avkey.target_class = tclass; + avkey.specified = AVTAB_XPERMS; + sattr = flex_array_get(policydb.type_attr_map_array, + scontext->type - 1); + BUG_ON(!sattr); + tattr = flex_array_get(policydb.type_attr_map_array, + tcontext->type - 1); + BUG_ON(!tattr); + ebitmap_for_each_positive_bit(sattr, snode, i) { + ebitmap_for_each_positive_bit(tattr, tnode, j) { + avkey.source_type = i + 1; + avkey.target_type = j + 1; + for (node = avtab_search_node(&policydb.te_avtab, &avkey); + node; + node = avtab_search_node_next(node, avkey.specified)) + services_compute_xperms_decision(xpermd, node); + + cond_compute_xperms(&policydb.te_cond_avtab, + &avkey, xpermd); + } + } +out: + read_unlock(&policy_rwlock); + return; +allow: + memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); + goto out; +} /** * security_compute_av - Compute access vector decisions. 
@@ -906,6 +1076,7 @@ static void avd_init(struct av_decision *avd) * @tsid: target security identifier * @tclass: target security class * @avd: access vector decisions + * @xperms: extended permissions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. @@ -913,13 +1084,15 @@ static void avd_init(struct av_decision *avd) void security_compute_av(u32 ssid, u32 tsid, u16 orig_tclass, - struct av_decision *avd) + struct av_decision *avd, + struct extended_perms *xperms) { u16 tclass; struct context *scontext = NULL, *tcontext = NULL; read_lock(&policy_rwlock); avd_init(avd); + xperms->len = 0; if (!ss_initialized) goto allow; @@ -947,7 +1120,7 @@ void security_compute_av(u32 ssid, goto allow; goto out; } - context_struct_compute_av(scontext, tcontext, tclass, avd); + context_struct_compute_av(scontext, tcontext, tclass, avd, xperms); map_decision(orig_tclass, avd, policydb.allow_unknown); out: read_unlock(&policy_rwlock); @@ -993,7 +1166,7 @@ void security_compute_av_user(u32 ssid, goto out; } - context_struct_compute_av(scontext, tcontext, tclass, avd); + context_struct_compute_av(scontext, tcontext, tclass, avd, NULL); out: read_unlock(&policy_rwlock); return; @@ -1515,7 +1688,7 @@ static int security_compute_sid(u32 ssid, if (avdatum) { /* Use the type from the type transition/member/change rule. */ - newcontext.type = avdatum->data; + newcontext.type = avdatum->u.data; } /* if we have a objname this is a file trans check so check those rules */ diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h index e8d907e903cdb1..6abcd8729ec3a6 100644 --- a/security/selinux/ss/services.h +++ b/security/selinux/ss/services.h @@ -11,5 +11,11 @@ extern struct policydb policydb; +void services_compute_xperms_drivers(struct extended_perms *xperms, + struct avtab_node *node); + +void services_compute_xperms_decision(struct extended_perms_decision *xpermd, + struct avtab_node *node); + #endif /* _SS_SERVICES_H_ */ From 9629d04ae06812f217846b69728c969afee690b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 10 Jul 2015 17:19:56 -0400 Subject: [PATCH 020/734] selinux: reduce locking overhead in inode_free_security() The inode_free_security() function just took the superblock's isec_lock before checking and trying to remove the inode security struct from the linked list. In many cases, the list was empty and so the lock taking is wasteful as no useful work is done. On multi-socket systems with a large number of CPUs, there can also be a fair amount of spinlock contention on the isec_lock if many tasks are exiting at the same time. This patch changes the code to check the state of the list first before taking the lock and attempting to dequeue it. The list_del_init() can be called more than once on the same list with no harm as long as they are properly serialized. It should not be possible to have inode_free_security() called concurrently with list_add(). For better safety, however, we use list_empty_careful() here even though it is still not completely safe in case that happens. 
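For reference, the reason the repeated call is harmless: list_del_init() re-initialises the entry to point at itself, so a second invocation merely unlinks a self-linked node (illustrative sketch, not part of the patch):

	list_del_init(&isec->list);	/* unlinks from the superblock list and re-inits the node */
	list_del_init(&isec->list);	/* safe no-op: prev/next already point back at the node */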
Signed-off-by: Waiman Long Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a049b72162707a..4de09f0227b4e5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -254,10 +254,21 @@ static void inode_free_security(struct inode *inode) struct inode_security_struct *isec = inode->i_security; struct superblock_security_struct *sbsec = inode->i_sb->s_security; - spin_lock(&sbsec->isec_lock); - if (!list_empty(&isec->list)) + /* + * As not all inode security structures are in a list, we check for + * empty list outside of the lock to make sure that we won't waste + * time taking a lock doing nothing. + * + * The list_del_init() function can be safely called more than once. + * It should not be possible for this function to be called with + * concurrent list_add(), but for better safety against future changes + * in the code, we use list_empty_careful() here. + */ + if (!list_empty_careful(&isec->list)) { + spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); - spin_unlock(&sbsec->isec_lock); + spin_unlock(&sbsec->isec_lock); + } /* * The inode may still be referenced in a path walk and From 5dee25d08eac01472904b0ab32ce35edee5c0518 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 10 Jul 2015 17:19:57 -0400 Subject: [PATCH 021/734] selinux: initialize sock security class to default value Initialize the security class of sock security structures to the generic socket class. This is similar to what is already done in inode_alloc_security for files. Generally the sclass field will later be set by socket_post_create or sk_clone or sock_graft, but for protocol implementations that fail to call any of these for newly accepted sockets, we want some sane default that will yield a legitimate avc denied message with non-garbage values for class and permission. Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4de09f0227b4e5..ef310f82717d77 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4559,6 +4559,7 @@ static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority sksec->peer_sid = SECINITSID_UNLABELED; sksec->sid = SECINITSID_UNLABELED; + sksec->sclass = SECCLASS_SOCKET; selinux_netlbl_sk_security_reset(sksec); sk->sk_security = sksec; From bd1741f4cf05d7709348f591d16eeb5f786de673 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 10 Jul 2015 17:19:57 -0400 Subject: [PATCH 022/734] selinux: Augment BUG_ON assertion for secclass_map. Ensure that we catch any cases where tclass == 0.
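secclass_map[] is indexed with tclass - 1, so a zero tclass would underflow the array; the strengthened assertion guards both ends of the range (sketch of the indexing being protected):

	BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
	perms = secclass_map[tclass - 1].perms;	/* tclass is 1-based, the map is 0-based */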
Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 2d5e1b04cd5029..324acc62f7e074 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -116,6 +116,7 @@ static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) return; } + BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); perms = secclass_map[tclass-1].perms; audit_log_format(ab, " {"); @@ -164,7 +165,7 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla kfree(scontext); } - BUG_ON(tclass >= ARRAY_SIZE(secclass_map)); + BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name); } From c3c188b2c3ed29effe8693672ee1c84184103b4e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 Jul 2015 17:19:58 -0400 Subject: [PATCH 023/734] selinux: Create a common helper to determine an inode label [ver #3] Create a common helper function to determine the label for a new inode. This is then used by: - may_create() - selinux_dentry_init_security() - selinux_inode_init_security() This will change the behaviour of the functions slightly, bringing them all into line. Suggested-by: Stephen Smalley Signed-off-by: David Howells Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 87 +++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ef310f82717d77..f4be0a11078811 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1709,6 +1709,32 @@ static int file_has_perm(const struct cred *cred, return rc; } +/* + * Determine the label for an inode that might be unioned. + */ +static int selinux_determine_inode_label(const struct inode *dir, + const struct qstr *name, + u16 tclass, + u32 *_new_isid) +{ + const struct superblock_security_struct *sbsec = dir->i_sb->s_security; + const struct inode_security_struct *dsec = dir->i_security; + const struct task_security_struct *tsec = current_security(); + + if ((sbsec->flags & SE_SBINITIALIZED) && + (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { + *_new_isid = sbsec->mntpoint_sid; + } else if ((sbsec->flags & SBLABEL_MNT) && + tsec->create_sid) { + *_new_isid = tsec->create_sid; + } else { + return security_transition_sid(tsec->sid, dsec->sid, tclass, + name, _new_isid); + } + + return 0; +} + /* Check whether a task can create a file. 
*/ static int may_create(struct inode *dir, struct dentry *dentry, @@ -1725,7 +1751,6 @@ static int may_create(struct inode *dir, sbsec = dir->i_sb->s_security; sid = tsec->sid; - newsid = tsec->create_sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; @@ -1736,12 +1761,10 @@ static int may_create(struct inode *dir, if (rc) return rc; - if (!newsid || !(sbsec->flags & SBLABEL_MNT)) { - rc = security_transition_sid(sid, dsec->sid, tclass, - &dentry->d_name, &newsid); - if (rc) - return rc; - } + rc = selinux_determine_inode_label(dir, &dentry->d_name, tclass, + &newsid); + if (rc) + return rc; rc = avc_has_perm(sid, newsid, tclass, FILE__CREATE, &ad); if (rc) @@ -2715,32 +2738,14 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, void **ctx, u32 *ctxlen) { - const struct cred *cred = current_cred(); - struct task_security_struct *tsec; - struct inode_security_struct *dsec; - struct superblock_security_struct *sbsec; - struct inode *dir = d_backing_inode(dentry->d_parent); u32 newsid; int rc; - tsec = cred->security; - dsec = dir->i_security; - sbsec = dir->i_sb->s_security; - - if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) { - newsid = tsec->create_sid; - } else { - rc = security_transition_sid(tsec->sid, dsec->sid, - inode_mode_to_security_class(mode), - name, - &newsid); - if (rc) { - printk(KERN_WARNING - "%s: security_transition_sid failed, rc=%d\n", - __func__, -rc); - return rc; - } - } + rc = selinux_determine_inode_label(d_inode(dentry->d_parent), name, + inode_mode_to_security_class(mode), + &newsid); + if (rc) + return rc; return security_sid_to_context(newsid, (char **)ctx, ctxlen); } @@ -2763,22 +2768,12 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, sid = tsec->sid; newsid = tsec->create_sid; - if ((sbsec->flags & SE_SBINITIALIZED) && - (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) - newsid = sbsec->mntpoint_sid; - else if (!newsid || !(sbsec->flags & SBLABEL_MNT)) { - rc = security_transition_sid(sid, dsec->sid, - inode_mode_to_security_class(inode->i_mode), - qstr, &newsid); - if (rc) { - printk(KERN_WARNING "%s: " - "security_transition_sid failed, rc=%d (dev=%s " - "ino=%ld)\n", - __func__, - -rc, inode->i_sb->s_id, inode->i_ino); - return rc; - } - } + rc = selinux_determine_inode_label( + dir, qstr, + inode_mode_to_security_class(inode->i_mode), + &newsid); + if (rc) + return rc; /* Possibly defer initialization to selinux_complete_init. */ if (sbsec->flags & SE_SBINITIALIZED) { From fda4d578ed0a7e1d116f56a15efea0e4ba78acad Mon Sep 17 00:00:00 2001 From: Laurent Bigonville Date: Tue, 7 Jul 2015 23:10:52 +0200 Subject: [PATCH 024/734] selinux: explicitly declare the role "base_r" This fixes the compilation of policy generated by mdp with the recent version of checkpolicy. 
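With the extra declaration, the test policy emitted by mdp begins roughly as follows (excerpt reconstructed from the fprintf calls below):

	type base_t;
	role base_r;
	role base_r types { base_t };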
Signed-off-by: Laurent Bigonville Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- scripts/selinux/mdp/mdp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index 62b34ce1f50dd1..e10beb11b696e4 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -98,6 +98,7 @@ int main(int argc, char *argv[]) /* types, roles, and allows */ fprintf(fout, "type base_t;\n"); + fprintf(fout, "role base_r;\n"); fprintf(fout, "role base_r types { base_t };\n"); for (i = 0; secclass_map[i].name; i++) fprintf(fout, "allow base_t base_t:%s *;\n", From fa3eec7791b0fe27e3112804a71ba445ff336a6b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 1 Jul 2015 23:51:43 +0100 Subject: [PATCH 025/734] regmap: Silence warning on invalid zero length read Zero length reads make no sense in a regmap context and are likely to trigger bugs further down the stack so insert an error check, also silencing compiler warnings about use of ret in cases where we iterate per register. Reported-by: Russell King Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 8894b992043e04..9c1f856842a3ae 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2180,6 +2180,8 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, return -EINVAL; if (reg % map->reg_stride) return -EINVAL; + if (val_count == 0) + return -EINVAL; map->lock(map->lock_arg); From 8225d3853f34f6cf9caff15d8c385a528e0d7cb1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 21 Nov 2014 10:06:01 -0500 Subject: [PATCH 026/734] seccomp: Replace smp_read_barrier_depends() with lockless_dereference() Recently lockless_dereference() was added which can be used in place of hard-coding smp_read_barrier_depends(). The following PATCH makes the change. Signed-off-by: Pranith Kumar Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4f44028943e663..980fd26da22e05 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sat, 13 Jun 2015 09:02:48 -0600 Subject: [PATCH 027/734] seccomp: add ptrace options for suspend/resume This patch is the first step in enabling checkpoint/restore of processes with seccomp enabled. One of the things CRIU does while dumping tasks is inject code into them via ptrace to collect information that is only available to the process itself. However, if we are in a seccomp mode where these processes are prohibited from making these syscalls, then what CRIU does kills the task. 
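Concretely, a CRIU-style tracer performs roughly the sequence below; the ptrace option introduced by this patch (described in the next paragraph) is what lets the injected syscalls run unfiltered (hypothetical sketch, not CRIU's actual code):

	ptrace(PTRACE_ATTACH, pid, NULL, NULL);
	/* suspend the tracee's seccomp filters for the duration of the dump */
	ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)PTRACE_O_SUSPEND_SECCOMP);
	/* ... inject parasite code and collect state ... */
	ptrace(PTRACE_DETACH, pid, NULL, NULL);	/* filters resume on detach */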
This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp filters to disable (and re-enable) seccomp filters for another task so that they can be successfully dumped (and restored). We restrict the set of processes that can disable seccomp through ptrace because although today ptrace can be used to bypass seccomp, there is some discussion of closing this loophole in the future and we would like this patch to not depend on that behavior and be future-proofed for when it is removed. Note that seccomp can be suspended before any filters are actually installed; this behavior is useful on criu restore, so that we can suspend seccomp, restore the filters, unmap our restore code from the restored process' address space, and then resume the task by detaching and have the filters resumed as well. v2 changes: * require that the tracer have no seccomp filters installed * drop TIF_NOTSC manipulation from the patch * change from ptrace command to a ptrace option and use this ptrace option as the flag to check. This means that as soon as the tracer detaches/dies, seccomp is re-enabled and as a corollary that one cannot disable seccomp across PTRACE_ATTACHs. v3 changes: * get rid of various #ifdefs everywhere * report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly used v4 changes: * get rid of may_suspend_seccomp() in favor of a capable() check in ptrace directly v5 changes: * check that seccomp is not enabled (or suspended) on the tracer Signed-off-by: Tycho Andersen CC: Will Drewry CC: Roland McGrath CC: Pavel Emelyanov CC: Serge E. Hallyn Acked-by: Oleg Nesterov Acked-by: Andy Lutomirski [kees: access seccomp.mode through seccomp_mode() instead] Signed-off-by: Kees Cook --- include/linux/ptrace.h | 1 + include/uapi/linux/ptrace.h | 6 ++++-- kernel/ptrace.c | 13 +++++++++++++ kernel/seccomp.c | 8 ++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 987a73a40ef846..061265f9287676 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -34,6 +34,7 @@ #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) +#define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index cf1019e15f5bc5..a7a69798661440 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args { #define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) /* eventless options */ -#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_SUSPEND_SECCOMP (1 << 21) -#define PTRACE_O_MASK (0x000000ff | PTRACE_O_EXITKILL) +#define PTRACE_O_MASK (\ + 0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP) #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36afb..787320de68e024 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if
(seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 980fd26da22e05..645e42d6fa4d2e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -590,6 +590,10 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + if (mode == 0) return; else if (mode == SECCOMP_MODE_STRICT) @@ -691,6 +695,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ From 221272f97ca528048a577a3ff23d7774286ca5fd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2015 15:29:16 -0700 Subject: [PATCH 028/734] seccomp: swap hard-coded zeros to defined name For clarity, if CONFIG_SECCOMP isn't defined, seccomp_mode() is returning "disabled". This makes that more clear, along with another 0-use, and results in no operational change. Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- kernel/seccomp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index a19ddacdac30ae..f4265039a94c8f 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -78,7 +78,7 @@ static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3) static inline int seccomp_mode(struct seccomp *s) { - return 0; + return SECCOMP_MODE_DISABLED; } #endif /* CONFIG_SECCOMP */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 645e42d6fa4d2e..383bd6caca815d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -594,7 +594,7 @@ void secure_computing_strict(int this_syscall) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; - if (mode == 0) + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); From 2de9d6006c190bb0f706e8404de94cd94293801f Mon Sep 17 00:00:00 2001 From: Nariman Poushin Date: Thu, 16 Jul 2015 16:36:22 +0100 Subject: [PATCH 029/734] regmap: Apply optional delay in multi_reg_write/register_patch Add an optional delay_us field in reg_sequence to allow the client to specify a delay (in microseconds) to be applied after any given write in a sequence of writes.
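As a usage sketch (hypothetical client code, assuming a driver already holding a valid struct regmap *map):

	static const struct reg_sequence init_seq[] = {
		{ 0x00, 0x1234 },
		{ 0x01, 0xabcd, 50 },	/* apply a 50us delay after this write */
		{ 0x02, 0x0001 },
	};

	ret = regmap_multi_reg_write(map, init_seq, ARRAY_SIZE(init_seq));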
We treat a delay in a sequence the same way we treat a page change as they are logically similar in that you can coalesce all writes before a delay (in the same way you can coalesce all writes before a page change is needed). Signed-off-by: Nariman Poushin Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 54 ++++++++++++++++++++++++++++++++---- include/linux/regmap.h | 5 +++- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 2cbb4502747d83..b3a5aa5cd580c0 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -18,6 +18,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1807,10 +1808,12 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map, int i, n; struct reg_sequence *base; unsigned int this_page = 0; + unsigned int page_change = 0; /* * the set of registers are not neccessarily in order, but * since the order of write must be preserved this algorithm - * chops the set each time the page changes + * chops the set each time the page changes. This also applies + * if there is a delay required at any point in the sequence. */ base = regs; for (i = 0, n = 0; i < num_regs; i++, n++) { @@ -1826,16 +1829,48 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map, this_page = win_page; if (win_page != this_page) { this_page = win_page; + page_change = 1; + } + } + + /* If we have both a page change and a delay make sure to + * write the regs and apply the delay before we change the + * page. + */ + + if (page_change || regs[i].delay_us) { + + /* For situations where the first write requires + * a delay we need to make sure we don't call + * raw_multi_reg_write with n=0 + * This can't occur with page breaks as we + * never write on the first iteration + */ + if (regs[i].delay_us && i == 0) + n = 1; + ret = _regmap_raw_multi_reg_write(map, base, n); if (ret != 0) return ret; + + if (regs[i].delay_us) + udelay(regs[i].delay_us); + base += n; n = 0; - } - ret = _regmap_select_page(map, &base[n].reg, range, 1); - if (ret != 0) - return ret; + + if (page_change) { + ret = _regmap_select_page(map, + &base[n].reg, + range, 1); + if (ret != 0) + return ret; + + page_change = 0; + } + } + } if (n > 0) return _regmap_raw_multi_reg_write(map, base, n); @@ -1854,6 +1889,9 @@ static int _regmap_multi_reg_write(struct regmap *map, ret = _regmap_write(map, regs[i].reg, regs[i].def); if (ret != 0) return ret; + + if (regs[i].delay_us) + udelay(regs[i].delay_us); } return 0; } @@ -1893,8 +1931,12 @@ static int _regmap_multi_reg_write(struct regmap *map, for (i = 0; i < num_regs; i++) { unsigned int reg = regs[i].reg; struct regmap_range_node *range; + + /* Coalesce all the writes between a page break or a delay + * in a sequence + */ range = _regmap_range_lookup(map, reg); - if (range) { + if (range || regs[i].delay_us) { size_t len = sizeof(struct reg_sequence)*num_regs; struct reg_sequence *base = kmemdup(regs, len, GFP_KERNEL); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c9ef2ec69142e2..5a7cf2136c81fd 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -51,14 +51,17 @@ struct reg_default { }; /** - * Register/value pairs for sequences of writes + * Register/value pairs for sequences of writes with an optional delay in + * microseconds to be applied after each write. * * @reg: Register address. * @def: Register value.
+ * @delay_us: Delay to be applied after the register write in microseconds */ struct reg_sequence { unsigned int reg; unsigned int def; + unsigned int delay_us; }; #ifdef CONFIG_REGMAP From b2c0b2cbb282f0cf42518ffacbe197e6f2884168 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2014 23:57:13 +0100 Subject: [PATCH 030/734] nmi: create generic NMI backtrace implementation x86's NMI backtrace implementation (for arch_trigger_all_cpu_backtrace()) is fairly generic in nature - the only architecture specific bits are the act of raising the NMI to other CPUs, and reporting the status of the NMI handler. These are fairly simple to factor out, and produce a generic implementation which can be shared between ARM and x86. Reviewed-by: Thomas Gleixner Signed-off-by: Russell King --- include/linux/nmi.h | 6 ++ lib/Makefile | 2 +- lib/nmi_backtrace.c | 162 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 lib/nmi_backtrace.c diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f94da0e65dea90..5791e3229068f7 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -49,6 +49,12 @@ static inline bool trigger_allbutself_cpu_backtrace(void) arch_trigger_all_cpu_backtrace(false); return true; } + +/* generic implementation */ +void nmi_trigger_all_cpu_backtrace(bool include_self, + void (*raise)(cpumask_t *mask)); +bool nmi_cpu_backtrace(struct pt_regs *regs); + #else static inline bool trigger_all_cpu_backtrace(void) { diff --git a/lib/Makefile b/lib/Makefile index 6897b527581a8d..392169c5bc4eda 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o + earlycpio.o seq_buf.o nmi_backtrace.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c new file mode 100644 index 00000000000000..88d3d32e59236b --- /dev/null +++ b/lib/nmi_backtrace.c @@ -0,0 +1,162 @@ +/* + * NMI backtrace support + * + * Gratuitously copied from arch/x86/kernel/apic/hw_nmi.c by Russell King, + * with the following header: + * + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + */ +#include +#include +#include +#include +#include + +#ifdef arch_trigger_all_cpu_backtrace +/* For reliability, we're prepared to waste bits here.
*/ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +static cpumask_t printtrace_mask; + +#define NMI_BUF_SIZE 4096 + +struct nmi_seq_buf { + unsigned char buffer[NMI_BUF_SIZE]; + struct seq_buf seq; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; + +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +void nmi_trigger_all_cpu_backtrace(bool include_self, + void (*raise)(cpumask_t *mask)) +{ + struct nmi_seq_buf *s; + int i, cpu, this_cpu = get_cpu(); + + if (test_and_set_bit(0, &backtrace_flag)) { + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + put_cpu(); + return; + } + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + + cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); + + /* + * Set up per_cpu seq_buf buffers that the NMIs running on the other + * CPUs will write to. + */ + for_each_cpu(cpu, to_cpumask(backtrace_mask)) { + s = &per_cpu(nmi_print_seq, cpu); + seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); + } + + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("Sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + raise(to_cpumask(backtrace_mask)); + } + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + touch_softlockup_watchdog(); + } + + /* + * Now that all the NMIs have triggered, we can dump out their + * back traces safely to the console. + */ + for_each_cpu(cpu, &printtrace_mask) { + int len, last_i = 0; + + s = &per_cpu(nmi_print_seq, cpu); + len = seq_buf_used(&s->seq); + if (!len) + continue; + + /* Print line by line. */ + for (i = 0; i < len; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < len) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + } + + clear_bit(0, &backtrace_flag); + smp_mb__after_atomic(); + put_cpu(); +} + +/* + * It is not safe to call printk() directly from NMI handlers. + * It may be fine if the NMI detected a lock up and we have no choice + * but to do so, but doing a NMI on all other CPUs to get a back trace + * can be done with a sysrq-l. We don't want that to lock up, which + * can happen if the NMI interrupts a printk in progress. + * + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes + * the content into a per cpu seq_buf buffer. Then when the NMIs are + * all done, we can safely dump the contents of the seq_buf to a printk() + * from a non NMI context. 
+ */ +static int nmi_vprintk(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + unsigned int len = seq_buf_used(&s->seq); + + seq_buf_vprintf(&s->seq, fmt, args); + return seq_buf_used(&s->seq) - len; +} + +bool nmi_cpu_backtrace(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + printk_func_t printk_func_save = this_cpu_read(printk_func); + + /* Replace printk to write into the NMI seq */ + this_cpu_write(printk_func, nmi_vprintk); + pr_warn("NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + this_cpu_write(printk_func, printk_func_save); + + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return true; + } + + return false; +} +NOKPROBE_SYMBOL(nmi_cpu_backtrace); +#endif From 4d7489ffba0aef4d2c708b6ff1428efd6ccf41df Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 10 Jul 2015 21:47:36 +0100 Subject: [PATCH 031/734] nmi: x86: convert to generic nmi handler Convert x86 to use the generic nmi handler code which can be shared between architectures. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: Russell King --- arch/x86/kernel/apic/hw_nmi.c | 133 +--------------------------------- 1 file changed, 4 insertions(+), 129 deletions(-) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925d00ab..045e424fb3680f 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? 
"all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. - */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } From 96f0e00378d4a1fc1b79933ef84e1595015de808 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2014 23:57:13 +0100 Subject: [PATCH 032/734] ARM: add basic support for on-demand backtrace of other CPUs As we now have generic infrastructure to support backtracing of other CPUs in the system on lockups, we can start to implement this for ARM. Initially, we add an IPI based implementation, as the GIC code needs modification to support the generation of FIQ IPIs, and not all ARM platforms have the ability to raise a FIQ in the non-secure world. This provides us with a "best efforts" implementation in the absence of FIQs. 
Signed-off-by: Russell King --- arch/arm/include/asm/irq.h | 5 +++++ arch/arm/kernel/smp.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 53c15dec7af6aa..be1d07d59ee978 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -35,6 +35,11 @@ extern void (*handle_arch_irq)(struct pt_regs *); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); #endif +#ifdef CONFIG_SMP +extern void arch_trigger_all_cpu_backtrace(bool); +#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x) +#endif + #endif #endif diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 90dfbedfbfb852..3a20c386fd33f7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + IPI_CPU_BACKTRACE = 15, }; static DECLARE_COMPLETION(cpu_running); @@ -630,6 +632,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) irq_exit(); break; + case IPI_CPU_BACKTRACE: + irq_enter(); + nmi_cpu_backtrace(regs); + irq_exit(); + break; + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -724,3 +732,13 @@ static int __init register_cpufreq_notifier(void) core_initcall(register_cpufreq_notifier); #endif + +static void raise_nmi(cpumask_t *mask) +{ + smp_cross_call(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_all_cpu_backtrace(bool include_self) +{ + nmi_trigger_all_cpu_backtrace(include_self, raise_nmi); +} From 0642ef6f2992eba46c41abb5ceb7d4fa14ba888e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Jun 2015 14:32:54 +0100 Subject: [PATCH 033/734] debugfs: Export bool read/write functions The file read/write functions for bools have no special dependencies on debugfs internals and are sufficiently non-trivial to be worth exporting so clients can re-use the implementation. 
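A client can then drop the helpers straight into its own file_operations, wrapping whichever side needs extra work (hypothetical driver sketch; my_bool_write is an assumed wrapper around debugfs_write_file_bool):

	static const struct file_operations my_bool_fops = {
		.open	= simple_open,
		.read	= debugfs_read_file_bool,	/* reuse the exported helper */
		.write	= my_bool_write,		/* custom logic around debugfs_write_file_bool */
		.llseek	= default_llseek,
	};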
Signed-off-by: Richard Fitzgerald Acked-by: Greg Kroah-Hartman Signed-off-by: Mark Brown --- fs/debugfs/file.c | 14 ++++++++------ include/linux/debugfs.h | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa0028b8d..6c55ade071c39d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 420311bcee38c2..9beb636b97ebcd 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -116,6 +116,12 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, bool debugfs_initialized(void); +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos); + #else #include @@ -282,6 +288,20 @@ static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev, return ERR_PTR(-ENODEV); } +static inline ssize_t debugfs_read_file_bool(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + return -ENODEV; +} + +static inline ssize_t debugfs_write_file_bool(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + return -ENODEV; +} + #endif #endif From d3dc5430d68fb91a62d971648170b34d46ab85bc Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Jun 2015 14:32:55 +0100 Subject: [PATCH 034/734] regmap: debugfs: Allow writes to cache state settings Allow the user to write the cache_only and cache_bypass settings. This can be useful for debugging. Since this can lead to the hardware getting out-of-sync with the cache, at least for the period that the cache state is forced, the kernel is tainted and the action is recorded in the kernel log. When disabling cache_only through debugfs a cache sync will be performed. 
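In-kernel, clearing cache_only through this file behaves roughly like the following regcache sequence (sketch; the debugfs path manipulates the flag under the map lock rather than calling regcache_cache_only() directly):

	regcache_cache_only(map, false);	/* stop short-circuiting accesses to the cache */
	err = regcache_sync(map);		/* write back anything the hardware missed */
	if (err)
		dev_err(map->dev, "Failed to sync cache %d\n", err);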
Signed-off-by: Richard Fitzgerald Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-debugfs.c | 90 ++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 5799a0b9e6cc41..6a61e4fa73a28a 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -469,6 +469,87 @@ static const struct file_operations regmap_access_fops = { .llseek = default_llseek, }; +static ssize_t regmap_cache_only_write_file(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct regmap *map = container_of(file->private_data, + struct regmap, cache_only); + ssize_t result; + bool was_enabled, require_sync = false; + int err; + + map->lock(map->lock_arg); + + was_enabled = map->cache_only; + + result = debugfs_write_file_bool(file, user_buf, count, ppos); + if (result < 0) { + map->unlock(map->lock_arg); + return result; + } + + if (map->cache_only && !was_enabled) { + dev_warn(map->dev, "debugfs cache_only=Y forced\n"); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } else if (!map->cache_only && was_enabled) { + dev_warn(map->dev, "debugfs cache_only=N forced: syncing cache\n"); + require_sync = true; + } + + map->unlock(map->lock_arg); + + if (require_sync) { + err = regcache_sync(map); + if (err) + dev_err(map->dev, "Failed to sync cache %d\n", err); + } + + return result; +} + +static const struct file_operations regmap_cache_only_fops = { + .open = simple_open, + .read = debugfs_read_file_bool, + .write = regmap_cache_only_write_file, +}; + +static ssize_t regmap_cache_bypass_write_file(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct regmap *map = container_of(file->private_data, + struct regmap, cache_bypass); + ssize_t result; + bool was_enabled; + + map->lock(map->lock_arg); + + was_enabled = map->cache_bypass; + + result = debugfs_write_file_bool(file, user_buf, count, ppos); + if (result < 0) + goto out; + + if (map->cache_bypass && !was_enabled) { + dev_warn(map->dev, "debugfs cache_bypass=Y forced\n"); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } else if (!map->cache_bypass && was_enabled) { + dev_warn(map->dev, "debugfs cache_bypass=N forced\n"); + } + +out: + map->unlock(map->lock_arg); + + return result; +} + +static const struct file_operations regmap_cache_bypass_fops = { + .open = simple_open, + .read = debugfs_read_file_bool, + .write = regmap_cache_bypass_write_file, +}; + void regmap_debugfs_init(struct regmap *map, const char *name) { struct rb_node *next; @@ -530,12 +611,13 @@ void regmap_debugfs_init(struct regmap *map, const char *name) } if (map->cache_type) { - debugfs_create_bool("cache_only", 0400, map->debugfs, - &map->cache_only); + debugfs_create_file("cache_only", 0600, map->debugfs, + &map->cache_only, ®map_cache_only_fops); debugfs_create_bool("cache_dirty", 0400, map->debugfs, &map->cache_dirty); - debugfs_create_bool("cache_bypass", 0400, map->debugfs, - &map->cache_bypass); + debugfs_create_file("cache_bypass", 0600, map->debugfs, + &map->cache_bypass, + ®map_cache_bypass_fops); } next = rb_first(&map->range_tree); From 9fe6b778ca93e6171dbb8e54df557a278a91abea Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:34 +0300 Subject: [PATCH 035/734] tracing: Prefer kcalloc over kzalloc with multiply Use kcalloc for allocating an array instead of kzalloc with multiply, as that is what kcalloc is used for. Found with checkpatch. 
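Besides matching intent, kcalloc() also checks the count * size multiplication for overflow, which the open-coded kzalloc() form does not (illustrative comparison):

	buf = kzalloc(n * sizeof(*buf), GFP_KERNEL);	/* unchecked multiply */
	buf = kcalloc(n, sizeof(*buf), GFP_KERNEL);	/* overflow-checked, same zeroed result */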
Link: http://lkml.kernel.org/r/1433835155-6894-2-git-send-email-gilf@ezchip.com Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff874364..5d219384b4d187 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; From 72917235fd5f08638be1d52dcdb0fee3ce2cc95f Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:35 +0300 Subject: [PATCH 036/734] tracing: Fix for non-continuous cpu ids Currently an exception occurs due to access beyond the buffer_iter range when using a cpu index bigger than num_possible_cpus(). Below is an example of such an exception when using cpus 0,1,16,17. In order to fix the buffer allocation size for non-continuous cpu ids we allocate according to the max cpu id and not according to the number of possible cpus. Example: $ cat /sys/kernel/debug/tracing/per_cpu/cpu1/trace Path: /bin/busybox CPU: 0 PID: 82 Comm: cat Not tainted 4.0.0 #29 task: 80734c80 ti: 80012000 task.ti: 80012000 [ECR ]: 0x00220100 => Invalid Read @ 0x00000000 by insn @ 0x800abafc [EFA ]: 0x00000000 [BLINK ]: ring_buffer_read_finish+0x24/0x64 [ERET ]: rb_check_pages+0x20/0x188 [STAT32]: 0x00001a00 : BTA: 0x800abafc SP: 0x80013f0c FP: 0x57719cf8 LPS: 0x200036b4 LPE: 0x200036b8 LPC: 0x00000000 r00: 0x8002aca0 r01: 0x00001606 r02: 0x00000000 r03: 0x00000001 r04: 0x00000000 r05: 0x804b4954 r06: 0x00030003 r07: 0x8002a260 r08: 0x00000286 r09: 0x00080002 r10: 0x00001006 r11: 0x807351a4 r12: 0x00000001 Stack Trace: rb_check_pages+0x20/0x188 ring_buffer_read_finish+0x24/0x64 tracing_release+0x4e/0x170 __fput+0x62/0x158 task_work_run+0xa2/0xd4 do_notify_resume+0x52/0x7c resume_user_mode_begin+0xdc/0xe0 Link: http://lkml.kernel.org/r/1433835155-6894-3-git-send-email-gilf@ezchip.com Signed-off-by: Noam Camus Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d219384b4d187..59814adc39d6b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; From 5e2d5ef8ec1e3854daec41a3697a8d2ce05ff2ef Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:55:06 +0530 Subject: [PATCH 037/734] ftrace: correct the counter increment for trace_buffer data In ftrace_dump, iter.tr->trace_buffer.data is used to disable the buffer, but iter.trace_buffer->data is used to enable it. Even though both point to the same buffer, the same convention should be used for readability.
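Both spellings name the same per-cpu data, since iter.trace_buffer is initialised to &iter.tr->trace_buffer; the patch simply makes the disable side match the enable side (illustrative):

	atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);	/* disable */
	/* ... dump ... */
	atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);	/* re-enable */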
Link: http://lkml.kernel.org/r/1434972306-20043-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59814adc39d6b4..6e79408674aaa1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; From fcc742eaad7cbcbbb2a96edc8f1d22adbaa804cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:13:14 -0400 Subject: [PATCH 038/734] ring-buffer: Add event descriptor to simplify passing data Add rb_event_info descriptor to pass event info to functions a bit easier than using a bunch of parameters. This will also allow for changing the code around a bit to find better fast paths. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 91 ++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3c6..ba8f25ffcf6f12 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -399,6 +399,17 @@ struct rb_irq_work { bool wakeup_full; }; +/* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + /* * Used for which event context the event is in. * NMI = 0 @@ -2000,9 +2011,12 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) + struct ring_buffer_event *event, + struct rb_event_info *info) { + unsigned length = info->length; + u64 delta = info->delta; + /* Only a commit updates the timestamp */ if (unlikely(!rb_event_is_commit(cpu_buffer, event))) delta = 0; @@ -2011,7 +2025,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the resevered space. 
*/ - if (unlikely(add_timestamp)) { + if (unlikely(info->add_timestamp)) { event = rb_add_time_stamp(event, delta); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2203,10 +2217,11 @@ static unsigned rb_calculate_event_length(unsigned length) static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2291,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,25 +2384,24 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) + struct rb_event_info *info) { - struct buffer_page *tail_page; struct ring_buffer_event *event; + struct buffer_page *tail_page; unsigned long tail, write; /* @@ -2394,33 +2409,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * hold in the time field of the event, then we append a * TIME EXTEND event ahead of the data event. */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; - tail = write - length; + tail = write - info->length; /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (!tail) - delta = 0; + info->delta = 0; /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); @@ -2429,10 +2443,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. 
*/ if (!tail) - tail_page->page->time_stamp = ts; + tail_page->page->time_stamp = info->ts; /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + local_add(info->length, &cpu_buffer->entries_bytes); return event; } @@ -2521,9 +2535,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + struct rb_event_info info; int nr_loops = 0; - int add_timestamp; u64 diff; rb_start_commit(cpu_buffer); @@ -2543,10 +2556,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, } #endif - length = rb_calculate_event_length(length); + info.length = rb_calculate_event_length(length); again: - add_timestamp = 0; - delta = 0; + info.add_timestamp = 0; + info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. @@ -2560,35 +2573,35 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK local_clock_stable = sched_clock_stable(); #endif - WARN_ONCE(delta > (1ULL << 59), + WARN_ONCE(info.delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, + (unsigned long long)info.delta, + (unsigned long long)info.ts, (unsigned long long)cpu_buffer->write_stamp, local_clock_stable ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; + info.add_timestamp = 1; } } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); + event = __rb_reserve_next(cpu_buffer, &info); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; From 9826b2733a4399149072058a11f611357479229d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:36:45 -0400 Subject: [PATCH 039/734] ring-buffer: Move the adding of the extended timestamp out of line Requiring an extended time stamp is an uncommon occurrence, and it is best to do it out of line when needed. Add a noinline function that handles the extended timestamp and have it called behind an unlikely() check to move it completely out of the fast path. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba8f25ffcf6f12..a78d4ee4bc584c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2396,6 +2396,29 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big!
%llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -2582,22 +2605,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* Did the write stamp get updated already? */ if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(info.delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info.delta, - (unsigned long long)info.ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info.add_timestamp = 1; - } + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); } event = __rb_reserve_next(cpu_buffer, &info); From a4543a2fa9ef31d6d0f854a4e14f8f82e7996d8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 09:40:18 -0400 Subject: [PATCH 040/734] ring-buffer: Get timestamp after event is allocated Move the capturing of the timestamp to after an event is allocated. If the event is not a commit (where it is an event that preempted another event), then no timestamp is needed, because the delta of nested events is always zero. If the event starts on a new page, no delta needs to be calculated as the full timestamp will be added to the page header, and the event will have a delta of zero. Now if the event requires a time extend (the delta does not fit in the 27 bit delta slot in the header), then the event is discarded, the length is extended to hold the TIME_EXTEND event that allows for a 59 bit delta, and the commit is tried again. If the event can't be discarded (another event came in after it), then the TIME_EXTEND is added directly to the allocated event and the rest of the event is given padding. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 161 ++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 47 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a78d4ee4bc584c..b5ed553e0a45f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2009,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void +static void __always_inline rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2017,10 +2017,6 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the resevered space. 
@@ -2286,6 +2282,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2300,6 +2298,16 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; + /* + * If the event had a timestamp attached to it, remove it. + * The first event on a page (nested or not) always uses + * the full timestamp of the new page. + */ + if (info->add_timestamp) { + info->add_timestamp = 0; + info->length -= RB_LEN_TIME_EXTEND; + } + next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2386,6 +2394,11 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now to update timestamps */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2403,10 +2416,23 @@ static inline bool sched_clock_stable(void) } #endif +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); +static inline void rb_event_discard(struct ring_buffer_event *event); +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, struct rb_event_info *info) { + struct ring_buffer_event *padding; + int length; + int size; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2416,7 +2442,61 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info->add_timestamp = 1; + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. + */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; + + padding = rb_add_time_stamp(event, info->delta); + + if (size) { + length = info->length; + info->delta = 0; + info->length = size; + rb_update_event(cpu_buffer, padding, info); + + rb_event_discard(padding); + + /* Still visible, need to update write_stamp */ + rb_update_write_stamp(cpu_buffer, event); + + /* Still need to commit the padding. 
*/ + rb_end_commit(cpu_buffer); + + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + + /* The next iteration still uses the original length */ + info->length = length; + } } static struct ring_buffer_event * @@ -2426,14 +2506,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; + bool is_commit; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2442,31 +2515,42 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. - */ - if (!tail) - info->delta = 0; - /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, info); - - local_inc(&tail_page->entries); /* - * If this is the first commit on the page, then update - * its timestamp. + * If this is the first commit on the page, then it has the same + * timestamp as the page itself, otherwise we need to figure out + * the delta. */ - if (!tail) + info->ts = rb_time_stamp(cpu_buffer->buffer); + is_commit = rb_event_is_commit(cpu_buffer, event); + + /* Commits are special (non nested events) */ + info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; + + if (!tail) { + /* + * If this is the first commit on the page, set the + * page to its timestamp. + */ tail_page->page->time_stamp = info->ts; + info->delta = 0; + + } else if (unlikely(test_time_stamp(info->delta)) && + !info->add_timestamp) { + rb_handle_timestamp(cpu_buffer, event, info); + return ERR_PTR(-EAGAIN); + } + + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2560,7 +2644,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); @@ -2578,12 +2661,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif - info.length = rb_calculate_event_length(length); - again: info.add_timestamp = 0; - info.delta = 0; - + again: /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2596,19 +2676,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? 
*/ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) From 7d75e6833b579adb3de2c7b917de1204eeafea47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 10:29:10 -0400 Subject: [PATCH 041/734] ring-buffer: Make sure event has enough room for extend and padding Now that an event only adds a time extend after it is committed, if another event comes in before it can discard the allocated event, the time extend needs to be stored within the event. If the event is bigger than the size needed for the time extend, padding must be added. The minimum padding size is 8 bytes. Thus if the event is 12 bytes (size of time extend + 4), there will not be enough room to add both the time extend and padding. Make sure all events are either 8 bytes or 16 or more bytes. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b5ed553e0a45f9..781ce359976c97 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2208,6 +2208,21 @@ static unsigned rb_calculate_event_length(unsigned length) length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). + */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + return length; } From d90fd77402d3de56a9ca3df04e5d868d0979dc59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 12:12:27 -0400 Subject: [PATCH 042/734] ring-buffer: Reorganize function locations Functions in ring-buffer.c have gotten interleaved between different use cases. Move the functions around to get like functions closer together. This may or may not help gcc keep cache locality, but it makes it a little easier to work with the code.
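To make the length rule from the previous patch concrete, a worked sketch (assuming RB_LEN_TIME_EXTEND is 8 and RB_ALIGNMENT is 4, as this tree defines them):

	/*
	 * Lengths reaching this point are 4-byte aligned. An 8-byte
	 * event can be wholly replaced by an 8-byte TIME_EXTEND, and a
	 * 16-byte (or larger) event can hold the 8-byte TIME_EXTEND
	 * plus at least the 8-byte minimum padding. Only the 12-byte
	 * case fits neither, so it is rounded up to 16.
	 */
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)	/* 12 bytes */
		length += RB_ALIGNMENT;				/* now 16 */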
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 920 ++++++++++++++++++------------------- 1 file changed, 456 insertions(+), 464 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 781ce359976c97..1cce0fbf92cea0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1979,63 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. 
- */ -static void __always_inline -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - unsigned length = info->length; - u64 delta = info->delta; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2194,38 +2070,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - /* - * In case the time delta is larger than the 27 bits for it - * in the header, we need to add a timestamp. If another - * event comes in when trying to discard this one to increase - * the length, then the timestamp will be added in the allocated - * space of this event. If length is bigger than the size needed - * for the TIME_EXTEND, then padding has to be used. The events - * length must be either RB_LEN_TIME_EXTEND, or greater than or equal - * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. - * As length is a multiple of 4, we only need to worry if it - * is 12 (RB_LEN_TIME_EXTEND + 4). - */ - if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) - length += RB_ALIGNMENT; - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) @@ -2424,71 +2268,471 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - return true; -} -#endif + event->type_len = RINGBUF_TYPE_TIME_EXTEND; -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); -static inline void rb_event_discard(struct ring_buffer_event *event); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } -static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - struct ring_buffer_event *padding; - int length; - int size; + return skip_time_extend(event); +} - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable() ? 
"" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void __always_inline +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; /* - * Discarding this event to add a timestamp in front, but - * we still need to update the length of it to perform the discard. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - rb_update_event(cpu_buffer, event, info); - - if (rb_try_to_discard(cpu_buffer, event)) { - info->add_timestamp = 1; - /* - * The time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - info->length += RB_LEN_TIME_EXTEND; - return; + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } - /* - * Humpf! An event came in after this one, and because it is not a - * commit, it will have a delta of zero, thus, it will take on - * the timestamp of the previous commit, which happened a long time - * ago (we need to add a timestamp, remember?). - * We need to add the timestamp here. A timestamp is a fixed size - * of 8 bytes. That means the rest of the event needs to be - * padding. - */ - size = info->length - RB_LEN_TIME_EXTEND; - - /* The padding will have a delta of 1 */ - if (size) - info->delta--; + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length++; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). 
+ */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + + return length; +} + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return 1; + } + } + + /* could not discard */ + return 0; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. 
+ */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; + + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + +/* + * The lock and unlock are done within a preempt disable section. 
+ * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ + +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned int val = cpu_buffer->current_context; + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = RB_CTX_NMI; + else if (in_irq()) + bit = RB_CTX_IRQ; + else + bit = RB_CTX_SOFTIRQ; + } else + bit = RB_CTX_NORMAL; + + if (unlikely(val & (1 << bit))) + return 1; + + val |= (1 << bit); + cpu_buffer->current_context = val; + + return 0; +} + +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->current_context &= cpu_buffer->current_context - 1; +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + struct ring_buffer_event *padding; + int length; + int size; + + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. 
+ */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; padding = rb_add_time_stamp(event, info->delta); @@ -2573,84 +2817,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; - - new_index = rb_event_index(event); - old_index = new_index + rb_event_ts_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - unsigned long write_mask = - local_read(&bpage->write) & ~RB_WRITE_MASK; - unsigned long event_length = rb_event_length(event); - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - old_index += write_mask; - new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { - /* update counters */ - local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; - } - } - - /* could not discard */ - return 0; -} - -static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - local_inc(&cpu_buffer->committing); - local_inc(&cpu_buffer->commits); -} - -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long commits; - - if (RB_WARN_ON(cpu_buffer, - !local_read(&cpu_buffer->committing))) - return; - - again: - commits = local_read(&cpu_buffer->commits); - /* synchronize with interrupts */ - barrier(); - if (local_read(&cpu_buffer->committing) == 1) - rb_set_commit_to_write(cpu_buffer); - - local_dec(&cpu_buffer->committing); - - /* synchronize with interrupts */ - barrier(); - - /* - * Need to account for interrupts coming in between the - * updating of the commit page and the clearing of the - * committing counter. - */ - if (unlikely(local_read(&cpu_buffer->commits) != commits) && - !local_read(&cpu_buffer->committing)) { - local_inc(&cpu_buffer->committing); - goto again; - } -} - static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, @@ -2706,75 +2872,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } -/* - * The lock and unlock are done within a preempt disable section. - * The current_context per_cpu variable can only be modified - * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. 
To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. - * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. - * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. - */ - -static __always_inline int -trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) - return 1; - - val |= (1 << bit); - cpu_buffer->current_context = val; - - return 0; -} - -static __always_inline void -trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->current_context &= cpu_buffer->current_context - 1; -} - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2833,111 +2930,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer From 72ac426a5bb0cec572d26b4456f8c1e14601694e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 13:24:54 -0400 Subject: [PATCH 043/734] tracing: Clean up stack tracing and fix fentry updates Akashi Takahiro was porting the stack tracer to arm64 and found some issues with it. One was that it repeats the top function, due to the stack frame added by the mcount caller and added by itself. This was added when fentry came in, and before fentry created its own stack frame. But x86's fentry now creates its own stack frame, and there's no need to insert the function again. 
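A condensed sketch of the replacement logic (simplified from the diff below; names follow kernel/trace/trace_stack.c):

	/* save the real backtrace, then locate the traced function's ip in it */
	save_stack_trace(&max_stack_trace);

	for (i = 0; i < max_stack_trace.nr_entries; i++) {
		if (stack_dump_trace[i] == ip)
			break;
	}
	/* entries before index i are overhead of the stack tracer itself */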
This also cleans up the code a bit, where it doesn't need to do something special for fentry, and doesn't include insertion of a duplicate entry for the called function being traced. Link: http://lkml.kernel.org/r/55A646EE.6030402@linaro.org Some-suggestions-by: Jungseok Lee Some-suggestions-by: Mark Rutland Reported-by: AKASHI Takahiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 68 +++++++++++++------------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e936..b746399ab59c01 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. 
*/ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); From 8e436ca042d904533a1e14fdc85f0facdfca752f Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:58:08 +0530 Subject: [PATCH 044/734] ftrace: add tracing_thresh to function profile This patch extends the tracing_thresh functionality to the function profile tracer. If tracing_thresh is set, print only those entries whose average is greater than tracing_thresh. Link: http://lkml.kernel.org/r/1434972488-8571-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari [ Removed unnecessary 'moved' comment ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea36..f46dbb5cdf762b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -613,13 +613,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) From 82c355e81afbf16bc1ab379899a79eb66e2b7504 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 21:58:52 -0400 Subject: [PATCH 045/734] ftrace: Fix function_graph duration spacing with 7-digits Jungseok Lee noticed the following: Currently, the width of rows with 7-digit duration numbers is not aligned with the other cases, as in the following example.
3) $ 3999884 us | } 3) | finish_task_switch() { 3) 0.365 us | _raw_spin_unlock_irq(); 3) 3.333 us | } 3) $ 3999976 us | } 3) $ 3999979 us | } /* schedule */ By adding a single white space in the 7-digit case, the format can easily be unified, as follows. 3) $ 2237472 us | } 3) | finish_task_switch() { 3) 0.364 us | _raw_spin_unlock_irq(); 3) 3.125 us | } 3) $ 2237556 us | } 3) $ 2237559 us | } /* schedule */ Instead of making a special case for 7-digit numbers, the logic of the len and the space loop is slightly modified to make the two cases have the same format. Link: http://lkml.kernel.org/r/1436626300-1679-2-git-send-email-jungseoklee85@gmail.com Reported-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c1259..ca98445782acaa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } From b838e1d96c613019095ba008afbee800977b0582 Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Sat, 11 Jul 2015 14:51:40 +0000 Subject: [PATCH 046/734] tracing: Introduce two additional marks for delay Fine-granularity support for delay would be very useful when profiling VM logic, such as page allocation, including page reclaim and memory compaction, with the function graph tracer. Thus, this patch adds two additional marks with two changes. - The equal sign in the mark selection function is removed to align the code behavior with the comments and documentation. - The function graph example related to delay in ftrace.txt is updated to cover all supported marks. Link: http://lkml.kernel.org/r/1436626300-1679-3-git-send-email-jungseoklee85@gmail.com Cc: Byungchul Park Signed-off-by: Jungseok Lee Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 51 +++++++++++++++++++++++++--------- kernel/trace/trace_output.c | 4 ++- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 7ddb1e319f84d1..072d3c4d575393 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -686,6 +686,8 @@ The above is mostly meaningful for kernel developers. The marks are determined by the difference between this current trace and the next trace. '$' - greater than 1 second + '@' - greater than 100 millisecond + '*' - greater than 10 millisecond '#' - greater than 1000 microsecond '!' - greater than 100 microsecond '+' - greater than 10 microsecond @@ -1939,26 +1941,49 @@ want, depending on your needs. ie: - 0) | up_write() { - 0) 0.646 us | _spin_lock_irqsave(); - 0) 0.684 us | _spin_unlock_irqrestore(); - 0) 3.123 us | } - 0) 0.548 us | fput(); - 0) + 58.628 us | } + 3) # 1837.709 us | } /* __switch_to */ + 3) | finish_task_switch() { + 3) 0.313 us | _raw_spin_unlock_irq(); + 3) 3.177 us | } + 3) # 1889.063 us | } /* __schedule */ + 3) ! 140.417 us | } /* __schedule */ + 3) # 2034.948 us | } /* schedule */ + 3) * 33998.59 us | } /* schedule_preempt_disabled */ [...]
- 0) | putname() { - 0) | kmem_cache_free() { - 0) 0.518 us | __phys_addr(); - 0) 1.757 us | } - 0) 2.861 us | } - 0) ! 115.305 us | } - 0) ! 116.402 us | } + 1) 0.260 us | msecs_to_jiffies(); + 1) 0.313 us | __rcu_read_unlock(); + 1) + 61.770 us | } + 1) + 64.479 us | } + 1) 0.313 us | rcu_bh_qs(); + 1) 0.313 us | __local_bh_enable(); + 1) ! 217.240 us | } + 1) 0.365 us | idle_cpu(); + 1) | rcu_irq_exit() { + 1) 0.417 us | rcu_eqs_enter_common.isra.47(); + 1) 3.125 us | } + 1) ! 227.812 us | } + 1) ! 457.395 us | } + 1) @ 119760.2 us | } + + [...] + + 2) | handle_IPI() { + 1) 6.979 us | } + 2) 0.417 us | scheduler_ipi(); + 1) 9.791 us | } + 1) + 12.917 us | } + 2) 3.490 us | } + 1) + 15.729 us | } + 1) + 18.542 us | } + 2) $ 3594274 us | } + means that the function exceeded 10 usecs. ! means that the function exceeded 100 usecs. # means that the function exceeded 1000 usecs. + * means that the function exceeded 10 msecs. + @ means that the function exceeded 100 msecs. $ means that the function exceeded 1 sec. diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc9e..8e481a84aeea79 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } From c93bf928fea22c61f6b5c04786b325c9bfbc0462 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Sun, 12 Jul 2015 17:52:24 +0800 Subject: [PATCH 047/734] ftrace: Format MCOUNT_ADDR address as type unsigned long We always use type unsigned long to format the ip address, since the value of an ip address is never negative. This patch uses type unsigned long, instead of long, to format the ip address. The code reads more clearly with type unsigned long, although it is correct with either unsigned long or long. Link: http://lkml.kernel.org/r/1436694744-16747-1-git-send-email-mhuang@redhat.com Cc: Minfei Huang Cc: "H. Peter Anvin" Cc: James Hogan Cc: Michal Simek Cc: Benjamin Herrenschmidt Cc: "David S.
Miller" Signed-off-by: Minfei Huang Signed-off-by: Steven Rostedt --- arch/metag/include/asm/ftrace.h | 2 +- arch/microblaze/include/asm/ftrace.h | 2 +- arch/powerpc/include/asm/ftrace.h | 2 +- arch/sh/include/asm/ftrace.h | 2 +- arch/sparc/include/asm/ftrace.h | 2 +- arch/x86/include/asm/ftrace.h | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/metag/include/asm/ftrace.h b/arch/metag/include/asm/ftrace.h index 2901f0f7d944ec..a2269d60a945bc 100644 --- a/arch/metag/include/asm/ftrace.h +++ b/arch/metag/include/asm/ftrace.h @@ -6,7 +6,7 @@ #ifndef __ASSEMBLY__ extern void mcount_wrapper(void); -#define MCOUNT_ADDR ((long)(mcount_wrapper)) +#define MCOUNT_ADDR ((unsigned long)(mcount_wrapper)) static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index fd2fa2eca62f18..da0144f40d99d5 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -3,7 +3,7 @@ #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index e3661872fbea5b..ef89b146557310 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_POWERPC_FTRACE #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifdef __ASSEMBLY__ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index e79fb6ebaa4237..1f157b86eaa7a1 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -9,7 +9,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_ADDR ((unsigned long)(mcount)) #ifdef CONFIG_DYNAMIC_FTRACE #define CALL_ADDR ((long)(ftrace_call)) diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 9ec94ad116fbdd..3192a8e42fd62c 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_SPARC64_FTRACE #ifdef CONFIG_MCOUNT -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c4b678..24938852db3013 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ From 3bf2789cad9e6573dc19a6c3d123c2c049f2d90f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi Date: Mon, 22 Jun 2015 15:36:06 +0530 Subject: [PATCH 048/734] smack: allow mount opts setting over filesystems with binary mount data Add support for setting smack mount labels(using smackfsdef, smackfsroot, smackfshat, smackfsfloor, smackfstransmute) for filesystems with binary mount data like NFS. To achieve this, implement sb_parse_opts_str and sb_set_mnt_opts security operations in smack LSM similar to SELinux. 
Signed-off-by: Vivek Trivedi Signed-off-by: Amit Sahrawat Acked-by: Casey Schaufler --- security/smack/smack.h | 18 +++ security/smack/smack_lsm.c | 241 +++++++++++++++++++++++++++++++------ 2 files changed, 219 insertions(+), 40 deletions(-) diff --git a/security/smack/smack.h b/security/smack/smack.h index 244e035e5a99f3..69ab9eb7d6d927 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -143,6 +143,24 @@ struct smack_onlycap { struct smack_known *smk_label; }; +/* Super block security struct flags for mount options */ +#define FSDEFAULT_MNT 0x01 +#define FSFLOOR_MNT 0x02 +#define FSHAT_MNT 0x04 +#define FSROOT_MNT 0x08 +#define FSTRANS_MNT 0x10 + +#define NUM_SMK_MNT_OPTS 5 + +enum { + Opt_error = -1, + Opt_fsdefault = 1, + Opt_fsfloor = 2, + Opt_fshat = 3, + Opt_fsroot = 4, + Opt_fstransmute = 5, +}; + /* * Mount options */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index a143328f75ebb0..d962f887d3f445 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "smack.h" #define TRANS_TRUE "TRUE" @@ -64,6 +65,15 @@ static char *smk_bu_mess[] = { "Unconfined Object", /* SMACK_UNCONFINED_OBJECT */ }; +static const match_table_t tokens = { + {Opt_fsdefault, SMK_FSDEFAULT "%s"}, + {Opt_fsfloor, SMK_FSFLOOR "%s"}, + {Opt_fshat, SMK_FSHAT "%s"}, + {Opt_fsroot, SMK_FSROOT "%s"}, + {Opt_fstransmute, SMK_FSTRANS "%s"}, + {Opt_error, NULL}, +}; + static void smk_bu_mode(int mode, char *s) { int i = 0; @@ -577,76 +587,193 @@ static int smack_sb_copy_data(char *orig, char *smackopts) } /** - * smack_sb_kern_mount - Smack specific mount processing + * smack_parse_opts_str - parse Smack specific mount options + * @options: mount options string + * @opts: where to store converted mount opts + * + * Returns 0 on success or -ENOMEM on error. 
+ * + * converts Smack specific mount options to generic security option format + */ +static int smack_parse_opts_str(char *options, + struct security_mnt_opts *opts) +{ + char *p; + char *fsdefault = NULL, *fsfloor = NULL; + char *fshat = NULL, *fsroot = NULL, *fstransmute = NULL; + int rc = -ENOMEM, num_mnt_opts = 0; + + opts->num_mnt_opts = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + + switch (token) { + case Opt_fsdefault: + if (fsdefault) + goto out_opt_err; + fsdefault = match_strdup(&args[0]); + if (!fsdefault) + goto out_err; + break; + case Opt_fsfloor: + if (fsfloor) + goto out_opt_err; + fsfloor = match_strdup(&args[0]); + if (!fsfloor) + goto out_err; + break; + case Opt_fshat: + if (fshat) + goto out_opt_err; + fshat = match_strdup(&args[0]); + if (!fshat) + goto out_err; + break; + case Opt_fsroot: + if (fsroot) + goto out_opt_err; + fsroot = match_strdup(&args[0]); + if (!fsroot) + goto out_err; + break; + case Opt_fstransmute: + if (fstransmute) + goto out_opt_err; + fstransmute = match_strdup(&args[0]); + if (!fstransmute) + goto out_err; + break; + default: + rc = -EINVAL; + pr_warn("Smack: unknown mount option\n"); + goto out_err; + } + } + + opts->mnt_opts = kcalloc(NUM_SMK_MNT_OPTS, sizeof(char *), GFP_ATOMIC); + if (!opts->mnt_opts) + goto out_err; + + opts->mnt_opts_flags = kcalloc(NUM_SMK_MNT_OPTS, sizeof(int), + GFP_ATOMIC); + if (!opts->mnt_opts_flags) { + kfree(opts->mnt_opts); + goto out_err; + } + + if (fsdefault) { + opts->mnt_opts[num_mnt_opts] = fsdefault; + opts->mnt_opts_flags[num_mnt_opts++] = FSDEFAULT_MNT; + } + if (fsfloor) { + opts->mnt_opts[num_mnt_opts] = fsfloor; + opts->mnt_opts_flags[num_mnt_opts++] = FSFLOOR_MNT; + } + if (fshat) { + opts->mnt_opts[num_mnt_opts] = fshat; + opts->mnt_opts_flags[num_mnt_opts++] = FSHAT_MNT; + } + if (fsroot) { + opts->mnt_opts[num_mnt_opts] = fsroot; + opts->mnt_opts_flags[num_mnt_opts++] = FSROOT_MNT; + } + if (fstransmute) { + opts->mnt_opts[num_mnt_opts] = fstransmute; + opts->mnt_opts_flags[num_mnt_opts++] = FSTRANS_MNT; + } + + opts->num_mnt_opts = num_mnt_opts; + return 0; + +out_opt_err: + rc = -EINVAL; + pr_warn("Smack: duplicate mount options\n"); + +out_err: + kfree(fsdefault); + kfree(fsfloor); + kfree(fshat); + kfree(fsroot); + kfree(fstransmute); + return rc; +} + +/** + * smack_set_mnt_opts - set Smack specific mount options * @sb: the file system superblock - * @flags: the mount flags - * @data: the smack mount options + * @opts: Smack mount options + * @kern_flags: mount option from kernel space or user space + * @set_kern_flags: where to store converted mount opts * * Returns 0 on success, an error code on failure + * + * Allow filesystems with binary mount data to explicitly set Smack mount + * labels. 
*/ -static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) +static int smack_set_mnt_opts(struct super_block *sb, + struct security_mnt_opts *opts, + unsigned long kern_flags, + unsigned long *set_kern_flags) { struct dentry *root = sb->s_root; struct inode *inode = d_backing_inode(root); struct superblock_smack *sp = sb->s_security; struct inode_smack *isp; struct smack_known *skp; - char *op; - char *commap; + int i; + int num_opts = opts->num_mnt_opts; int transmute = 0; - int specified = 0; if (sp->smk_initialized) return 0; sp->smk_initialized = 1; - for (op = data; op != NULL; op = commap) { - commap = strchr(op, ','); - if (commap != NULL) - *commap++ = '\0'; - - if (strncmp(op, SMK_FSHAT, strlen(SMK_FSHAT)) == 0) { - op += strlen(SMK_FSHAT); - skp = smk_import_entry(op, 0); + for (i = 0; i < num_opts; i++) { + switch (opts->mnt_opts_flags[i]) { + case FSDEFAULT_MNT: + skp = smk_import_entry(opts->mnt_opts[i], 0); if (IS_ERR(skp)) return PTR_ERR(skp); - sp->smk_hat = skp; - specified = 1; - - } else if (strncmp(op, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0) { - op += strlen(SMK_FSFLOOR); - skp = smk_import_entry(op, 0); + sp->smk_default = skp; + break; + case FSFLOOR_MNT: + skp = smk_import_entry(opts->mnt_opts[i], 0); if (IS_ERR(skp)) return PTR_ERR(skp); sp->smk_floor = skp; - specified = 1; - - } else if (strncmp(op, SMK_FSDEFAULT, - strlen(SMK_FSDEFAULT)) == 0) { - op += strlen(SMK_FSDEFAULT); - skp = smk_import_entry(op, 0); + break; + case FSHAT_MNT: + skp = smk_import_entry(opts->mnt_opts[i], 0); if (IS_ERR(skp)) return PTR_ERR(skp); - sp->smk_default = skp; - specified = 1; - - } else if (strncmp(op, SMK_FSROOT, strlen(SMK_FSROOT)) == 0) { - op += strlen(SMK_FSROOT); - skp = smk_import_entry(op, 0); + sp->smk_hat = skp; + break; + case FSROOT_MNT: + skp = smk_import_entry(opts->mnt_opts[i], 0); if (IS_ERR(skp)) return PTR_ERR(skp); sp->smk_root = skp; - specified = 1; - - } else if (strncmp(op, SMK_FSTRANS, strlen(SMK_FSTRANS)) == 0) { - op += strlen(SMK_FSTRANS); - skp = smk_import_entry(op, 0); + break; + case FSTRANS_MNT: + skp = smk_import_entry(opts->mnt_opts[i], 0); if (IS_ERR(skp)) return PTR_ERR(skp); sp->smk_root = skp; transmute = 1; - specified = 1; + break; + default: + break; } } @@ -654,7 +781,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) /* * Unprivileged mounts don't get to specify Smack values. */ - if (specified) + if (num_opts) return -EPERM; /* * Unprivileged mounts get root and default from the caller. @@ -663,6 +790,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) sp->smk_root = skp; sp->smk_default = skp; } + /* * Initialize the root inode. 
*/ @@ -681,6 +809,37 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) return 0; } +/** + * smack_sb_kern_mount - Smack specific mount processing + * @sb: the file system superblock + * @flags: the mount flags + * @data: the smack mount options + * + * Returns 0 on success, an error code on failure + */ +static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) +{ + int rc = 0; + char *options = data; + struct security_mnt_opts opts; + + security_init_mnt_opts(&opts); + + if (!options) + goto out; + + rc = smack_parse_opts_str(options, &opts); + if (rc) + goto out_err; + +out: + rc = smack_set_mnt_opts(sb, &opts, 0, NULL); + +out_err: + security_free_mnt_opts(&opts); + return rc; +} + /** * smack_sb_statfs - Smack check on statfs * @dentry: identifies the file system in question @@ -4264,6 +4423,8 @@ struct security_hook_list smack_hooks[] = { LSM_HOOK_INIT(sb_copy_data, smack_sb_copy_data), LSM_HOOK_INIT(sb_kern_mount, smack_sb_kern_mount), LSM_HOOK_INIT(sb_statfs, smack_sb_statfs), + LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts), + LSM_HOOK_INIT(sb_parse_opts_str, smack_parse_opts_str), LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds), LSM_HOOK_INIT(bprm_committing_creds, smack_bprm_committing_creds), From ca70d27e445fe721587598030b97357b35f61913 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 24 Jun 2015 07:41:07 +0800 Subject: [PATCH 049/734] sysfs: fix simple_return.cocci warnings security/smack/smackfs.c:2251:1-4: WARNING: end returns can be simpified and declaration on line 2250 can be dropped Simplify a trivial if-return sequence. Possibly combine with a preceding function call. Generated by: scripts/coccinelle/misc/simple_return.cocci Signed-off-by: Fengguang Wu Acked-by: Serge Hallyn Acked-by: Casey Schaufler --- security/smack/smackfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 2716d02119f3e8..81a2888a990863 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -2320,11 +2320,7 @@ static const struct file_operations smk_revoke_subj_ops = { */ static int smk_init_sysfs(void) { - int err; - err = sysfs_create_mount_point(fs_kobj, "smackfs"); - if (err) - return err; - return 0; + return sysfs_create_mount_point(fs_kobj, "smackfs"); } /** From 7deef550f3a7d44c1d52a6d54f824e7e180c08ae Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:10 -0600 Subject: [PATCH 050/734] toshiba_acpi: Adapt /proc/acpi/toshiba/keys to TOS1900 devices Since the introduction of TOS1900 device support to the driver, the "keys" entry under the proc directory has been broken, because it only handled TOS620X devices. This patch adapts the code to show the hotkey values of TOS1900 devices too, so that any programs still using that interface get working hotkey reporting on these devices.
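For illustration only (the sample values are hypothetical), a read of /proc/acpi/toshiba/keys after this change produces the two fields emitted by keys_proc_show() on both TOS620X and TOS1900 machines:

	hotkey_ready: 1
	hotkey: 0x013b

The format strings come straight from the seq_printf() calls in the diff below; only the sample values are invented.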
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 56 +++++++++++------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 3ad7b1fa24ce54..c3a0c4d0c1dc4e 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1499,32 +1499,10 @@ static const struct file_operations fan_proc_fops = { static int keys_proc_show(struct seq_file *m, void *v) { struct toshiba_acpi_dev *dev = m->private; - u32 hci_result; - u32 value; - - if (!dev->key_event_valid && dev->system_event_supported) { - hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value); - if (hci_result == TOS_SUCCESS) { - dev->key_event_valid = 1; - dev->last_key_event = value; - } else if (hci_result == TOS_FIFO_EMPTY) { - /* Better luck next time */ - } else if (hci_result == TOS_NOT_SUPPORTED) { - /* - * This is a workaround for an unresolved issue on - * some machines where system events sporadically - * become disabled. - */ - hci_result = hci_write(dev, HCI_SYSTEM_EVENT, 1); - pr_notice("Re-enabled hotkeys\n"); - } else { - pr_err("Error reading hotkey status\n"); - return -EIO; - } - } seq_printf(m, "hotkey_ready: %d\n", dev->key_event_valid); seq_printf(m, "hotkey: 0x%04x\n", dev->last_key_event); + return 0; } @@ -2361,22 +2339,28 @@ static void toshiba_acpi_report_hotkey(struct toshiba_acpi_dev *dev, static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev) { - u32 hci_result, value; - int retries = 3; - int scancode; - if (dev->info_supported) { - scancode = toshiba_acpi_query_hotkey(dev); - if (scancode < 0) + int scancode = toshiba_acpi_query_hotkey(dev); + + if (scancode < 0) { pr_err("Failed to query hotkey event\n"); - else if (scancode != 0) + } else if (scancode != 0) { toshiba_acpi_report_hotkey(dev, scancode); + dev->key_event_valid = 1; + dev->last_key_event = scancode; + } } else if (dev->system_event_supported) { + u32 result; + u32 value; + int retries = 3; + do { - hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value); - switch (hci_result) { + result = hci_read(dev, HCI_SYSTEM_EVENT, &value); + switch (result) { case TOS_SUCCESS: toshiba_acpi_report_hotkey(dev, (int)value); + dev->key_event_valid = 1; + dev->last_key_event = value; break; case TOS_NOT_SUPPORTED: /* @@ -2384,15 +2368,15 @@ static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev) * issue on some machines where system events * sporadically become disabled. */ - hci_result = - hci_write(dev, HCI_SYSTEM_EVENT, 1); - pr_notice("Re-enabled hotkeys\n"); + result = hci_write(dev, HCI_SYSTEM_EVENT, 1); + if (result == TOS_SUCCESS) + pr_notice("Re-enabled hotkeys\n"); /* Fall through */ default: retries--; break; } - } while (retries && hci_result != TOS_FIFO_EMPTY); + } while (retries && result != TOS_FIFO_EMPTY); } } From fc5462f8525b47fa219452289ecb22c921c16823 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:11 -0600 Subject: [PATCH 051/734] toshiba_acpi: Add /dev/toshiba_acpi device There were previous attempts to "merge" the toshiba SMM module to the toshiba_acpi one, they were trying to imitate what the old toshiba module does, however, some models (TOS1900 devices) come with a "crippled" implementation and do not provide all the "features" a "genuine" Toshiba BIOS does. This patch adds a new device called toshiba_acpi, which aim is to enable userspace to access the SMM on Toshiba laptops via ACPI calls. 
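As a rough sketch of the intended userspace usage of the new device node and the convenience SCI ioctl described below (the SCI function code in ebx is hypothetical; the device path, ioctl name, and the SCI_GET value 0xf300 come from this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/toshiba.h>

	int main(void)
	{
		/* eax selects SCI_GET (0xf300); ebx is a made-up function code */
		SMMRegisters regs = { .eax = 0xf300, .ebx = 0x0000 };
		int fd = open(TOSHIBA_ACPI_DEVICE, O_RDWR);

		if (fd < 0)
			return 1;
		/* the driver opens/closes the SCI around the TCI call itself */
		if (ioctl(fd, TOSHIBA_ACPI_SCI, &regs) == 0)
			printf("eax=0x%08x ecx=0x%08x\n", regs.eax, regs.ecx);
		close(fd);
		return 0;
	}
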
A new convenience _IOWR command is created to access the SCI functions; it opens and closes the SCI internally to guard against buggy BIOSes, while at the same time providing backwards compatibility. Older programs (and new ones) that wish to access the SMM on newer models can do so by pointing their path at /dev/toshiba_acpi (instead of /dev/toshiba), as the toshiba.h header was modified to reflect these changes and to add all the toshiba_acpi paths and commands. However, it is strongly recommended to use the new IOCTL for any SCI command, to stay clear of buggy BIOSes. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- Documentation/ioctl/ioctl-number.txt | 2 +- drivers/platform/x86/toshiba_acpi.c | 91 ++++++++++++++++++++++++++++ include/uapi/linux/toshiba.h | 32 +++++++++- 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 611c52267d2481..21d2f27c886b4d 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -263,7 +263,7 @@ Code Seq#(hex) Include File Comments 's' all linux/cdk.h 't' 00-7F linux/ppp-ioctl.h 't' 80-8F linux/isdn_ppp.h -'t' 90 linux/toshiba.h +'t' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM 'u' 00-1F linux/smb_fs.h gone 'u' 20-3F linux/uvcvideo.h USB video class host driver 'v' 00-1F linux/ext2_fs.h conflict! diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index c3a0c4d0c1dc4e..802577f43a2337 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -50,6 +50,8 @@ #include #include #include +#include <linux/miscdevice.h> +#include <linux/toshiba.h> #include MODULE_AUTHOR("John Belmonte"); @@ -170,6 +172,7 @@ struct toshiba_acpi_dev { struct led_classdev led_dev; struct led_classdev kbd_led; struct led_classdev eco_led; + struct miscdevice miscdev; int force_fan; int last_key_event; @@ -2239,6 +2242,81 @@ static struct attribute_group toshiba_attr_group = { .attrs = toshiba_attributes, }; +/* + * Misc device + */ +static int toshiba_acpi_smm_bridge(SMMRegisters *regs) +{ + u32 in[TCI_WORDS] = { regs->eax, regs->ebx, regs->ecx, + regs->edx, regs->esi, regs->edi }; + u32 out[TCI_WORDS]; + acpi_status status; + + status = tci_raw(toshiba_acpi, in, out); + if (ACPI_FAILURE(status)) { + pr_err("ACPI call to query SMM registers failed\n"); + return -EIO; + } + + /* Fill out the SMM struct with the TCI call results */ + regs->eax = out[0]; + regs->ebx = out[1]; + regs->ecx = out[2]; + regs->edx = out[3]; + regs->esi = out[4]; + regs->edi = out[5]; + + return 0; +} + +static long toshiba_acpi_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + SMMRegisters __user *argp = (SMMRegisters __user *)arg; + SMMRegisters regs; + int ret; + + if (!argp) + return -EINVAL; + + switch (cmd) { + case TOSH_SMM: + if (copy_from_user(&regs, argp, sizeof(SMMRegisters))) + return -EFAULT; + ret = toshiba_acpi_smm_bridge(&regs); + if (ret) + return ret; + if (copy_to_user(argp, &regs, sizeof(SMMRegisters))) + return -EFAULT; + break; + case TOSHIBA_ACPI_SCI: + if (copy_from_user(&regs, argp, sizeof(SMMRegisters))) + return -EFAULT; + /* Ensure we are being called with a SCI_{GET, SET} register */ + if (regs.eax != SCI_GET && regs.eax != SCI_SET) + return -EINVAL; + if (!sci_open(toshiba_acpi)) + return -EIO; + ret = toshiba_acpi_smm_bridge(&regs); + sci_close(toshiba_acpi); + if (ret) + return ret; + if (copy_to_user(argp, &regs, sizeof(SMMRegisters))) + return -EFAULT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static
const struct file_operations toshiba_acpi_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = toshiba_acpi_ioctl, + .llseek = noop_llseek, +}; + /* * Hotkeys */ @@ -2540,6 +2618,8 @@ static int toshiba_acpi_remove(struct acpi_device *acpi_dev) { struct toshiba_acpi_dev *dev = acpi_driver_data(acpi_dev); + misc_deregister(&dev->miscdev); + remove_toshiba_proc_entries(dev); if (dev->sysfs_created) @@ -2611,6 +2691,17 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) return -ENOMEM; dev->acpi_dev = acpi_dev; dev->method_hci = hci_method; + dev->miscdev.minor = MISC_DYNAMIC_MINOR; + dev->miscdev.name = "toshiba_acpi"; + dev->miscdev.fops = &toshiba_acpi_fops; + + ret = misc_register(&dev->miscdev); + if (ret) { + pr_err("Failed to register miscdevice\n"); + kfree(dev); + return ret; + } + acpi_dev->driver_data = dev; dev_set_drvdata(&acpi_dev->dev, dev); diff --git a/include/uapi/linux/toshiba.h b/include/uapi/linux/toshiba.h index e9bef5b2f91ebf..c58bf4b5bb2664 100644 --- a/include/uapi/linux/toshiba.h +++ b/include/uapi/linux/toshiba.h @@ -1,6 +1,7 @@ /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops * * Copyright (c) 1996-2000 Jonathan A. Buzzard (jonathan@buzzard.org.uk) + * Copyright (c) 2015 Azael Avalos * * Thanks to Juergen Heinzl for the pointers * on making sure the structure is aligned and packed. @@ -20,9 +21,18 @@ #ifndef _UAPI_LINUX_TOSHIBA_H #define _UAPI_LINUX_TOSHIBA_H -#define TOSH_PROC "/proc/toshiba" -#define TOSH_DEVICE "/dev/toshiba" -#define TOSH_SMM _IOWR('t', 0x90, int) /* broken: meant 24 bytes */ +/* + * Toshiba modules paths + */ + +#define TOSH_PROC "/proc/toshiba" +#define TOSH_DEVICE "/dev/toshiba" +#define TOSHIBA_ACPI_PROC "/proc/acpi/toshiba" +#define TOSHIBA_ACPI_DEVICE "/dev/toshiba_acpi" + +/* + * Toshiba SMM structure + */ typedef struct { unsigned int eax; @@ -33,5 +43,21 @@ typedef struct { unsigned int edi __attribute__ ((packed)); } SMMRegisters; +/* + * IOCTLs (0x90 - 0x91) + */ + +#define TOSH_SMM _IOWR('t', 0x90, SMMRegisters) +/* + * Convenience toshiba_acpi command. + * + * The System Configuration Interface (SCI) is opened/closed internally + * to avoid userspace of buggy BIOSes. + * + * The toshiba_acpi module checks whether the eax register is set with + * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not. + */ +#define TOSHIBA_ACPI_SCI _IOWR('t', 0x91, SMMRegisters) + #endif /* _UAPI_LINUX_TOSHIBA_H */ From a88bc06e5aec4984f5bf01c6d410a0939134f737 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:12 -0600 Subject: [PATCH 052/734] toshiba_acpi: Avoid registering input device on WMI event laptops Commit f11f999e9890 ("toshiba_acpi: Refuse to load on machines with buggy INFO implementations") denied loading on laptops with a WMI Event GUID given that such laptops manage the hotkeys via that interface, however, such laptops have a working Toshiba Configuration Interface (TCI), and thus, such commit denied several supported features. This patch avoids registering the input device and ignores all hotkey events on laptops with such WMI Event GUID, making the supported features found in those laptops to work. 
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 802577f43a2337..48b16b323c8904 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -2466,6 +2466,11 @@ static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev) u32 hci_result; int error; + if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) { + pr_info("WMI event detected, hotkeys will not be monitored\n"); + return 0; + } + error = toshiba_acpi_enable_hotkeys(dev); if (error) return error; @@ -2813,6 +2818,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) switch (event) { case 0x80: /* Hotkeys and some system events */ + /* + * Machines with this WMI GUID aren't supported due to bugs in + * their AML. + * + * Return silently to avoid triggering a netlink event. + */ + if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) + return; toshiba_acpi_process_hotkeys(dev); break; case 0x81: /* Dock events */ @@ -2899,14 +2912,6 @@ static int __init toshiba_acpi_init(void) { int ret; - /* - * Machines with this WMI guid aren't supported due to bugs in - * their AML. This check relies on wmi initializing before - * toshiba_acpi to guarantee guids have been identified. - */ - if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) - return -ENODEV; - toshiba_proc_dir = proc_mkdir(PROC_TOSHIBA, acpi_root_dir); if (!toshiba_proc_dir) { pr_err("Unable to create proc dir " PROC_TOSHIBA "\n"); From 695f6060903cefa08ffb78433136f51ac0f94488 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:13 -0600 Subject: [PATCH 053/734] toshiba_acpi: Transflective backlight updates This patch changes the tr function second parameter from bool to u32, to be on par with the rest of the TCI functions of the driver, and the code was updated accordingly. Also, the check for translective support was moved to the *add function, as the {__get, set}_lcd_brightness functions make use of it. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 30 +++++++++++------------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 48b16b323c8904..649786de4a79d7 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1187,22 +1187,17 @@ static int toshiba_hotkey_event_type_get(struct toshiba_acpi_dev *dev, } /* Transflective Backlight */ -static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, bool *enabled) +static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 *status) { - u32 hci_result; - u32 status; + u32 hci_result = hci_read(dev, HCI_TR_BACKLIGHT, status); - hci_result = hci_read(dev, HCI_TR_BACKLIGHT, &status); - *enabled = !status; return hci_result == TOS_SUCCESS ? 0 : -EIO; } -static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, bool enable) +static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 status) { - u32 hci_result; - u32 value = !enable; + u32 hci_result = hci_write(dev, HCI_TR_BACKLIGHT, !status); - hci_result = hci_write(dev, HCI_TR_BACKLIGHT, value); return hci_result == TOS_SUCCESS ? 
0 : -EIO; } @@ -1216,12 +1211,11 @@ static int __get_lcd_brightness(struct toshiba_acpi_dev *dev) int brightness = 0; if (dev->tr_backlight_supported) { - bool enabled; - int ret = get_tr_backlight_status(dev, &enabled); + int ret = get_tr_backlight_status(dev, &value); if (ret) return ret; - if (enabled) + if (value) return 0; brightness++; } @@ -1271,8 +1265,7 @@ static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value) u32 hci_result; if (dev->tr_backlight_supported) { - bool enable = !value; - int ret = set_tr_backlight_status(dev, enable); + int ret = set_tr_backlight_status(dev, !value); if (ret) return ret; @@ -2563,7 +2556,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev) struct backlight_properties props; int brightness; int ret; - bool enabled; /* * Some machines don't support the backlight methods at all, and @@ -2580,10 +2572,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev) return 0; } - /* Determine whether or not BIOS supports transflective backlight */ - ret = get_tr_backlight_status(dev, &enabled); - dev->tr_backlight_supported = !ret; - /* * Tell acpi-video-detect code to prefer vendor backlight on all * systems with transflective backlight and on dmi matched systems. @@ -2723,6 +2711,10 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) if (toshiba_acpi_setup_keyboard(dev)) pr_info("Unable to activate hotkeys\n"); + /* Determine whether or not BIOS supports transflective backlight */ + ret = get_tr_backlight_status(dev, &dummy); + dev->tr_backlight_supported = !ret; + ret = toshiba_acpi_setup_backlight(dev); if (ret) goto error; From d7e4f2e2ca392bce468718bcbba808108d81d501 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:47 -0600 Subject: [PATCH 054/734] toshiba_acpi: Remove unused wireless defines Commit 2b74103547b4 ("toshiba_acpi: Remove bluetooth rfkill code") removed bluetooth related code, however, the wireless defines were not removed and are unused. This patch simply removes those defines as there is no code using them. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 649786de4a79d7..90d8cb1c2e27ad 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -113,7 +113,6 @@ MODULE_LICENSE("GPL"); #define HCI_VIDEO_OUT 0x001c #define HCI_HOTKEY_EVENT 0x001e #define HCI_LCD_BRIGHTNESS 0x002a -#define HCI_WIRELESS 0x0056 #define HCI_ACCELEROMETER 0x006d #define HCI_KBD_ILLUMINATION 0x0095 #define HCI_ECO_MODE 0x0097 @@ -142,10 +141,6 @@ MODULE_LICENSE("GPL"); #define HCI_VIDEO_OUT_LCD 0x1 #define HCI_VIDEO_OUT_CRT 0x2 #define HCI_VIDEO_OUT_TV 0x4 -#define HCI_WIRELESS_KILL_SWITCH 0x01 -#define HCI_WIRELESS_BT_PRESENT 0x0f -#define HCI_WIRELESS_BT_ATTACH 0x40 -#define HCI_WIRELESS_BT_POWER 0x80 #define SCI_KBD_MODE_MASK 0x1f #define SCI_KBD_MODE_FNZ 0x1 #define SCI_KBD_MODE_AUTO 0x2 From d50c9005d32b4eda6e11f7ec4f1b00a93088e0ca Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:46 -0600 Subject: [PATCH 055/734] toshiba_acpi: Reorder toshiba_acpi_alt_keymap entries This patch simply reorders the entries found in the new keymap by ascending order, this is simply a cosmetic change, no functionality was modified. 
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 90d8cb1c2e27ad..6013a11caeea81 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -246,16 +246,16 @@ static const struct key_entry toshiba_acpi_keymap[] = { }; static const struct key_entry toshiba_acpi_alt_keymap[] = { - { KE_KEY, 0x157, { KEY_MUTE } }, { KE_KEY, 0x102, { KEY_ZOOMOUT } }, { KE_KEY, 0x103, { KEY_ZOOMIN } }, { KE_KEY, 0x12c, { KEY_KBDILLUMTOGGLE } }, { KE_KEY, 0x139, { KEY_ZOOMRESET } }, - { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } }, { KE_KEY, 0x13c, { KEY_BRIGHTNESSDOWN } }, { KE_KEY, 0x13d, { KEY_BRIGHTNESSUP } }, - { KE_KEY, 0x158, { KEY_WLAN } }, + { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } }, { KE_KEY, 0x13f, { KEY_TOUCHPAD_TOGGLE } }, + { KE_KEY, 0x157, { KEY_MUTE } }, + { KE_KEY, 0x158, { KEY_WLAN } }, { KE_END, 0 }, }; From 1e574dbfadafd9fd1f2a414efb731d7538277e71 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:49 -0600 Subject: [PATCH 056/734] toshiba_acpi: Change some variables to avoid warnings from ninja-check This patch changes some variables to avoid warnings from ninja-check. We are basically moving some variables inside the conditionals where such variables are being used, and we are checking the returned values of some others. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 6013a11caeea81..3bfdfddc38ac3d 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1651,7 +1651,6 @@ static ssize_t kbd_backlight_mode_store(struct device *dev, { struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev); int mode; - int time; int ret; @@ -1682,7 +1681,7 @@ static ssize_t kbd_backlight_mode_store(struct device *dev, /* Only make a change if the actual mode has changed */ if (toshiba->kbd_mode != mode) { /* Shift the time to "base time" (0x3c0000 == 60 seconds) */ - time = toshiba->kbd_time << HCI_MISC_SHIFT; + int time = toshiba->kbd_time << HCI_MISC_SHIFT; /* OR the "base time" to the actual method format */ if (toshiba->kbd_type == 1) { @@ -2856,10 +2855,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) static int toshiba_acpi_suspend(struct device *device) { struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device)); - u32 result; - if (dev->hotkey_dev) + if (dev->hotkey_dev) { + u32 result; + result = hci_write(dev, HCI_HOTKEY_EVENT, HCI_HOTKEY_DISABLE); + if (result != TOS_SUCCESS) + pr_info("Unable to disable hotkeys\n"); + } return 0; } @@ -2867,10 +2870,10 @@ static int toshiba_acpi_suspend(struct device *device) static int toshiba_acpi_resume(struct device *device) { struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device)); - int error; if (dev->hotkey_dev) { - error = toshiba_acpi_enable_hotkeys(dev); + int error = toshiba_acpi_enable_hotkeys(dev); + if (error) pr_info("Unable to re-enable hotkeys\n"); } From 5e32940621eb62064d98f42c9889db71b0368bde Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 11 Jul 2015 10:02:46 -0400 Subject: [PATCH 057/734] libnvdimm, btt: sparse fix Fix: drivers/nvdimm/btt.c:635:29: warning: restricted __le64 degrades to 
integer Signed-off-by: Dan Williams --- drivers/nvdimm/btt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 411c7b2bb37aec..552f1c4f4dc6cc 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -632,8 +632,9 @@ static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, arena->logoff = arena_off + le64_to_cpu(super->logoff); arena->info2off = arena_off + le64_to_cpu(super->info2off); - arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) : - (arena->info2off - arena->infooff + BTT_PG_SIZE); + arena->size = (le64_to_cpu(super->nextoff) > 0) + ? (le64_to_cpu(super->nextoff)) + : (arena->info2off - arena->infooff + BTT_PG_SIZE); arena->flags = le32_to_cpu(super->flags); } From ec92777f2ba93c00387b8fe53780c25adc57c744 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:35 -0600 Subject: [PATCH 058/734] libnvdimm: Update name of the ars_status_record mask field The spec suggests that this is a simple 'length' field, not a mask. Update the name accordingly. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 2b94ea2287bb92..e94bc20016b2d2 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -87,7 +87,7 @@ struct nd_cmd_ars_status { __u32 handle; __u32 flags; __u64 err_address; - __u64 mask; + __u64 length; } __packed records[0]; } __packed; From 39c686b862cdb2049b90e095b6c6c727b2a7ab60 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:36 -0600 Subject: [PATCH 059/734] libnvdimm: Add DSM support for Address Range Scrub commands Add support for the three ARS DSM commands: - Query ARS Capabilities - Queries the firmware to check if a given range supports scrub, and if so, which type (persistent vs. volatile) - Start ARS - Starts a scrub for a given range/type - Query ARS Status - Checks status of a previously started scrub, and provides the error logs if any. The commands are described by the example DSM spec at: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Also add these commands to the nfit_test test framework, and return canned data. 
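The status word these commands return packs two fields, per the ars_masks enum added to ndctl.h below. A minimal decode sketch (the helper names are illustrative, not part of the patch):

	/* low 16 bits: command status; high 16 bits: extended status */
	static inline u16 ars_status(u32 status)
	{
		return status & ARS_STATUS_MASK;
	}

	static inline u16 ars_ext_status(u32 status)
	{
		return status >> ARS_EXT_STATUS_SHIFT;
	}

In the nfit_test stub, for example, the ARS capability handler reports both scrub types by setting the extended status to (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16.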
Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/acpi/nfit.h | 1 + include/uapi/linux/ndctl.h | 10 ++ tools/testing/nvdimm/test/nfit.c | 199 ++++++++++++++++++++++--------- 4 files changed, 152 insertions(+), 59 deletions(-) diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 628a42c41ab126..ef8a664db25412 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -868,6 +868,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) struct acpi_device *adev; int i; + nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index 79b6d83875c1de..f2c2bb751882c3 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -107,6 +107,7 @@ struct acpi_nfit_desc { struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; + unsigned long bus_dsm_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, void *iobuf, u64 len, int rw); }; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e94bc20016b2d2..5b4a4be06e2b93 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -111,6 +111,11 @@ enum { ND_CMD_VENDOR = 9, }; +enum { + ND_ARS_VOLATILE = 1, + ND_ARS_PERSISTENT = 2, +}; + static inline const char *nvdimm_bus_cmd_name(unsigned cmd) { static const char * const names[] = { @@ -194,4 +199,9 @@ enum nd_driver_flags { enum { ND_MIN_NAMESPACE_SIZE = 0x00400000, }; + +enum ars_masks { + ARS_STATUS_MASK = 0x0000FFFF, + ARS_EXT_STATUS_SHIFT = 16, +}; #endif /* __NDCTL_H__ */ diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index d0bdae40ccc903..28dba918524e5b 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -147,75 +147,153 @@ static struct nfit_test *to_nfit_test(struct device *dev) return container_of(pdev, struct nfit_test, pdev); } +static int nfit_test_cmd_get_config_size(struct nd_cmd_get_config_size *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + nd_cmd->config_size = LABEL_SIZE; + nd_cmd->max_xfer = SZ_4K; + + return 0; +} + +static int nfit_test_cmd_get_config_data(struct nd_cmd_get_config_data_hdr + *nd_cmd, unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) + return -EINVAL; + + nd_cmd->status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(nd_cmd->out_buf, label + offset, len); + rc = buf_len - sizeof(*nd_cmd) - len; + + return rc; +} + +static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd, + unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + u32 *status; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) + return -EINVAL; + + status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd); + *status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(label + offset, nd_cmd->in_buf, len); + rc = buf_len - sizeof(*nd_cmd) - (len + 4); + + return rc; +} + +static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + 
nd_cmd->max_ars_out = 256; + nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16; + + return 0; +} + +static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + + return 0; +} + +static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->out_length = 256; + nd_cmd->num_records = 0; + nd_cmd->status = 0; + + return 0; +} + static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc); - struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - int i, rc; + int i, rc = 0; + + if (nvdimm) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) - return -ENOTTY; + if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + return -ENOTTY; - /* lookup label space for the given dimm */ - for (i = 0; i < ARRAY_SIZE(handle); i++) - if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i]) + /* lookup label space for the given dimm */ + for (i = 0; i < ARRAY_SIZE(handle); i++) + if (__to_nfit_memdev(nfit_mem)->device_handle == + handle[i]) + break; + if (i >= ARRAY_SIZE(handle)) + return -ENXIO; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + rc = nfit_test_cmd_get_config_size(buf, buf_len); break; - if (i >= ARRAY_SIZE(handle)) - return -ENXIO; + case ND_CMD_GET_CONFIG_DATA: + rc = nfit_test_cmd_get_config_data(buf, buf_len, + t->label[i]); + break; + case ND_CMD_SET_CONFIG_DATA: + rc = nfit_test_cmd_set_config_data(buf, buf_len, + t->label[i]); + break; + default: + return -ENOTTY; + } + } else { + if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask)) + return -ENOTTY; - switch (cmd) { - case ND_CMD_GET_CONFIG_SIZE: { - struct nd_cmd_get_config_size *nd_cmd = buf; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - nd_cmd->status = 0; - nd_cmd->config_size = LABEL_SIZE; - nd_cmd->max_xfer = SZ_4K; - rc = 0; - break; - } - case ND_CMD_GET_CONFIG_DATA: { - struct nd_cmd_get_config_data_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) - return -EINVAL; - - nd_cmd->status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(nd_cmd->out_buf, t->label[i] + offset, len); - rc = buf_len - sizeof(*nd_cmd) - len; - break; - } - case ND_CMD_SET_CONFIG_DATA: { - struct nd_cmd_set_config_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - u32 *status; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) - return -EINVAL; - - status = buf + nd_cmd->in_length + sizeof(*nd_cmd); - *status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(t->label[i] + offset, nd_cmd->in_buf, len); - rc = buf_len - sizeof(*nd_cmd) - (len + 4); - break; - } - default: - return -ENOTTY; + switch (cmd) { + case ND_CMD_ARS_CAP: + rc = nfit_test_cmd_ars_cap(buf, buf_len); + break; + case ND_CMD_ARS_START: + rc = nfit_test_cmd_ars_start(buf, buf_len); + break; + case ND_CMD_ARS_STATUS: + rc = nfit_test_cmd_ars_status(buf, buf_len); + 
break; + default: + return -ENOTTY; + } } return rc; @@ -876,6 +954,9 @@ static void nfit_test0_setup(struct nfit_test *t) set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); nd_desc = &acpi_desc->nd_desc; nd_desc->ndctl = nfit_test_ctl; } From 6b47496a6fc81816e7edaf8224dfb88e402a05f5 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 23 Jul 2015 11:58:48 -0600 Subject: [PATCH 060/734] libnvdimm, pmem: Change pmem physical sector size to PAGE_SIZE Based on a patch: c8fa317 brd: Request from fdisk 4k alignment by Boaz Harrosh, allow fdisk to create properly aligned partitions for DAX. This will also cause mkfs.ext4 to emit a warning if using a file system block size of less than PAGE_SIZE. Cc: Dan Williams Cc: Ross Zwisler Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Elliott, Robert Signed-off-by: Vishal Verma Acked-by: Boaz Harrosh Acked-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ade9eb917a4d94..bcf48f1334431b 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -162,6 +162,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns, return -ENOMEM; blk_queue_make_request(pmem->pmem_queue, pmem_make_request); + blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue); From 60e95f43fc8573e81f54b0c1e0bc542c2260d956 Mon Sep 17 00:00:00 2001 From: Linda Knippers Date: Wed, 22 Jul 2015 16:17:22 -0400 Subject: [PATCH 061/734] nfit: Don't check _STA on NVDIMM devices The _STA only applies to the root device, not the individual NVDIMMS, so don't check here. NVDIMM device state flags are checked elsewhere. Signed-off-by: Linda Knippers Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index ef8a664db25412..7c2638f914a909 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -764,9 +764,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM); - unsigned long long sta; - int i, rc = -ENODEV; - acpi_status status; + int i; nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en; adev = to_acpi_dev(acpi_desc); @@ -781,25 +779,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return force_enable_dimms ? 
0 : -ENODEV; } - status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta); - if (status == AE_NOT_FOUND) { - dev_dbg(dev, "%s missing _STA, assuming enabled...\n", - dev_name(&adev_dimm->dev)); - rc = 0; - } else if (ACPI_FAILURE(status)) - dev_err(dev, "%s failed to retrieve_STA, disabling...\n", - dev_name(&adev_dimm->dev)); - else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0) - dev_info(dev, "%s disabled by firmware\n", - dev_name(&adev_dimm->dev)); - else - rc = 0; - for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++) if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); - return force_enable_dimms ? 0 : rc; + return 0; } static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) From 730daa164e7c7e31c08fab940549f4acc3329432 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jul 2015 18:02:48 -0700 Subject: [PATCH 062/734] Yama: remove needless CONFIG_SECURITY_YAMA_STACKED Now that minor LSMs can cleanly stack with major LSMs, remove the unneeded config for Yama to be made to explicitly stack. Just selecting the main Yama CONFIG will allow it to work, regardless of the major LSM. Since distros using Yama are already forcing it to stack, this is effectively a no-op change. Additionally add MAINTAINERS entry. Signed-off-by: Kees Cook Signed-off-by: James Morris --- Documentation/security/Yama.txt | 10 ++++----- MAINTAINERS | 6 +++++ arch/mips/configs/pistachio_defconfig | 1 - include/linux/lsm_hooks.h | 6 +++-- security/Kconfig | 5 ----- security/security.c | 11 +++------ security/yama/Kconfig | 9 +------- security/yama/yama_lsm.c | 32 +++++++++------------------ 8 files changed, 28 insertions(+), 52 deletions(-) diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt index 227a63f018a27d..d9ee7d7a6c7fda 100644 --- a/Documentation/security/Yama.txt +++ b/Documentation/security/Yama.txt @@ -1,9 +1,7 @@ -Yama is a Linux Security Module that collects a number of system-wide DAC -security protections that are not handled by the core kernel itself. To -select it at boot time, specify "security=yama" (though this will disable -any other LSM). - -Yama is controlled through sysctl in /proc/sys/kernel/yama: +Yama is a Linux Security Module that collects system-wide DAC security +protections that are not handled by the core kernel itself. 
This is +selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled +at run-time through sysctls in /proc/sys/kernel/yama: - ptrace_scope diff --git a/MAINTAINERS b/MAINTAINERS index a2264167791acd..f8be2f79719758 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9102,6 +9102,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git S: Supported F: security/apparmor/ +YAMA SECURITY MODULE +M: Kees Cook +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip +S: Supported +F: security/yama/ + SENSABLE PHANTOM M: Jiri Slaby S: Maintained diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig index 1646cce032c34a..642b50946943cc 100644 --- a/arch/mips/configs/pistachio_defconfig +++ b/arch/mips/configs/pistachio_defconfig @@ -320,7 +320,6 @@ CONFIG_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_YAMA_STACKED=y CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_CRYPTO_AUTHENC=y CONFIG_CRYPTO_HMAC=y diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 9429f054c32396..ec3a6bab29de3a 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1881,8 +1881,10 @@ static inline void security_delete_hooks(struct security_hook_list *hooks, extern int __init security_module_enable(const char *module); extern void __init capability_add_hooks(void); -#ifdef CONFIG_SECURITY_YAMA_STACKED -void __init yama_add_hooks(void); +#ifdef CONFIG_SECURITY_YAMA +extern void __init yama_add_hooks(void); +#else +static inline void __init yama_add_hooks(void) { } #endif #endif /* ! __LINUX_LSM_HOOKS_H */ diff --git a/security/Kconfig b/security/Kconfig index bf4ec46474b631..e45237897b435f 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -132,7 +132,6 @@ choice default DEFAULT_SECURITY_SMACK if SECURITY_SMACK default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR - default DEFAULT_SECURITY_YAMA if SECURITY_YAMA default DEFAULT_SECURITY_DAC help @@ -151,9 +150,6 @@ choice config DEFAULT_SECURITY_APPARMOR bool "AppArmor" if SECURITY_APPARMOR=y - config DEFAULT_SECURITY_YAMA - bool "Yama" if SECURITY_YAMA=y - config DEFAULT_SECURITY_DAC bool "Unix Discretionary Access Controls" @@ -165,7 +161,6 @@ config DEFAULT_SECURITY default "smack" if DEFAULT_SECURITY_SMACK default "tomoyo" if DEFAULT_SECURITY_TOMOYO default "apparmor" if DEFAULT_SECURITY_APPARMOR - default "yama" if DEFAULT_SECURITY_YAMA default "" if DEFAULT_SECURITY_DAC endmenu diff --git a/security/security.c b/security/security.c index 595fffab48b0a2..e693ffcf9266e0 100644 --- a/security/security.c +++ b/security/security.c @@ -56,18 +56,13 @@ int __init security_init(void) pr_info("Security Framework initialized\n"); /* - * Always load the capability module. + * Load minor LSMs, with the capability module always first. */ capability_add_hooks(); -#ifdef CONFIG_SECURITY_YAMA_STACKED - /* - * If Yama is configured for stacking load it next. - */ yama_add_hooks(); -#endif + /* - * Load the chosen module if there is one. - * This will also find yama if it is stacking + * Load all the remaining security modules. 
*/ do_security_initcalls(); diff --git a/security/yama/Kconfig b/security/yama/Kconfig index 3123e1da2fedb0..90c605eea89215 100644 --- a/security/yama/Kconfig +++ b/security/yama/Kconfig @@ -6,14 +6,7 @@ config SECURITY_YAMA This selects Yama, which extends DAC support with additional system-wide security settings beyond regular Linux discretionary access controls. Currently available is ptrace scope restriction. + Like capabilities, this security module stacks with other LSMs. Further information can be found in Documentation/security/Yama.txt. If you are unsure how to answer this question, answer N. - -config SECURITY_YAMA_STACKED - bool "Yama stacked with other LSMs" - depends on SECURITY_YAMA - default n - help - When Yama is built into the kernel, force it to stack with the - selected primary LSM. diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 9ed32502470e9b..d3c19c970a06bf 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -353,11 +353,6 @@ static struct security_hook_list yama_hooks[] = { LSM_HOOK_INIT(task_free, yama_task_free), }; -void __init yama_add_hooks(void) -{ - security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks)); -} - #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -396,25 +391,18 @@ static struct ctl_table yama_sysctl_table[] = { }, { } }; -#endif /* CONFIG_SYSCTL */ - -static __init int yama_init(void) +static void __init yama_init_sysctl(void) { -#ifndef CONFIG_SECURITY_YAMA_STACKED - /* - * If yama is being stacked this is already taken care of. - */ - if (!security_module_enable("yama")) - return 0; -#endif - pr_info("Yama: becoming mindful.\n"); - -#ifdef CONFIG_SYSCTL if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); -#endif - - return 0; } +#else +static inline void yama_init_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ -security_initcall(yama_init); +void __init yama_add_hooks(void) +{ + pr_info("Yama: becoming mindful.\n"); + security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks)); + yama_init_sysctl(); +} From 21abb1ec414c75abe32c3854848ff30e2b4a6113 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 22 Jul 2015 14:25:31 -0700 Subject: [PATCH 063/734] Smack: IPv6 host labeling IPv6 appears to be (finally) coming of age with the influx of autonomous devices. In support of this, add the ability to associate a Smack label with IPv6 addresses. This patch also cleans up some of the conditional compilation associated with the introduction of secmark processing. It's now more obvious which bit of code goes with which feature. Signed-off-by: Casey Schaufler --- Documentation/security/Smack.txt | 27 +- security/smack/smack.h | 48 +++- security/smack/smack_lsm.c | 262 +++++++++++++------ security/smack/smackfs.c | 428 +++++++++++++++++++++++++------ 4 files changed, 604 insertions(+), 161 deletions(-) diff --git a/Documentation/security/Smack.txt b/Documentation/security/Smack.txt index de5e1aeca7fb95..5e6d07fbed07c5 100644 --- a/Documentation/security/Smack.txt +++ b/Documentation/security/Smack.txt @@ -28,6 +28,10 @@ Smack kernels use the CIPSO IP option. Some network configurations are intolerant of IP options and can impede access to systems that use them as Smack does. +Smack is used in the Tizen operating system. Please +go to http://wiki.tizen.org for information about how +Smack is used in Tizen. 
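The lsm_hooks.h hunk in the Yama patch above uses the standard pattern
for optional minor-LSM hooks: pair the real declaration with an empty
inline stub so that callers need no #ifdef. A minimal sketch of the
pattern, with a hypothetical minor LSM "foo" standing in for Yama (the
names are illustrative, not from the patch):

    /* Hypothetical minor LSM "foo", mirroring the Yama hunk above. */
    #ifdef CONFIG_SECURITY_FOO
    extern void __init foo_add_hooks(void);
    #else
    static inline void __init foo_add_hooks(void) { }
    #endif

    /*
     * security_init() can then call foo_add_hooks() unconditionally;
     * when CONFIG_SECURITY_FOO is off, the empty stub compiles away.
     */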
+ The current git repository for Smack user space is: git://github.com/smack-team/smack.git @@ -108,6 +112,8 @@ in the smackfs filesystem. This pseudo-filesystem is mounted on /sys/fs/smackfs. access + Provided for backward compatibility. The access2 interface + is preferred and should be used instead. This interface reports whether a subject with the specified Smack label has a particular access to an object with a specified Smack label. Write a fixed format access rule to @@ -136,6 +142,8 @@ change-rule those in the fourth string. If there is no such rule it will be created using the access specified in the third and the fourth strings. cipso + Provided for backward compatibility. The cipso2 interface + is preferred and should be used instead. This interface allows a specific CIPSO header to be assigned to a Smack label. The format accepted on write is: "%24s%4d%4d"["%4d"]... @@ -157,7 +165,19 @@ direct doi This contains the CIPSO domain of interpretation used in network packets. +ipv6host + This interface allows specific IPv6 internet addresses to be + treated as single label hosts. Packets are sent to single + label hosts only from processes that have Smack write access + to the host label. All packets received from single label hosts + are given the specified label. The format accepted on write is: + "%h:%h:%h:%h:%h:%h:%h:%h label" or + "%h:%h:%h:%h:%h:%h:%h:%h/%d label". + The "::" address shortcut is not supported. + If label is "-DELETE" a matched entry will be deleted. load + Provided for backward compatibility. The load2 interface + is preferred and should be used instead. This interface allows access control rules in addition to the system defined rules to be specified. The format accepted on write is: @@ -181,6 +201,8 @@ load2 permissions that are not allowed. The string "r-x--" would specify read and execute access. load-self + Provided for backward compatibility. The load-self2 interface + is preferred and should be used instead. This interface allows process specific access rules to be defined. These rules are only consulted if access would otherwise be permitted, and are intended to provide additional @@ -205,6 +227,8 @@ netlabel received from single label hosts are given the specified label. The format accepted on write is: "%d.%d.%d.%d label" or "%d.%d.%d.%d/%d label". + If the label specified is "-CIPSO" the address is treated + as a host that supports CIPSO headers. onlycap This contains labels processes must have for CAP_MAC_ADMIN and CAP_MAC_OVERRIDE to be effective. If this file is empty @@ -232,7 +256,8 @@ unconfined is dangerous and can ruin the proper labeling of your system. It should never be used in production. -You can add access rules in /etc/smack/accesses. They take the form: +If you are using the smackload utility +you can add access rules in /etc/smack/accesses. They take the form: subjectlabel objectlabel access diff --git a/security/smack/smack.h b/security/smack/smack.h index 69ab9eb7d6d927..fff0c612bbb77b 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -17,11 +17,26 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif /* CONFIG_IPV6 */ #include #include #include #include +/* + * Use IPv6 port labeling if IPv6 is enabled and secmarks + * are not being used. 
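+ * When CONFIG_SECURITY_SMACK_NETFILTER is enabled, IPv6 checks are
+ * driven by the packet secmark instead; see the
+ * SMACK_IPV6_SECMARK_LABELING block below.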
+ */ +#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#define SMACK_IPV6_PORT_LABELING 1 +#endif + +#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_SECURITY_SMACK_NETFILTER) +#define SMACK_IPV6_SECMARK_LABELING 1 +#endif + /* * Smack labels were limited to 23 characters for a long time. */ @@ -118,15 +133,30 @@ struct smack_rule { }; /* - * An entry in the table identifying hosts. + * An entry in the table identifying IPv4 hosts. */ -struct smk_netlbladdr { +struct smk_net4addr { struct list_head list; - struct sockaddr_in smk_host; /* network address */ + struct in_addr smk_host; /* network address */ struct in_addr smk_mask; /* network mask */ + int smk_masks; /* mask size */ + struct smack_known *smk_label; /* label */ +}; + +#if IS_ENABLED(CONFIG_IPV6) +/* + * An entry in the table identifying IPv6 hosts. + */ +struct smk_net6addr { + struct list_head list; + struct in6_addr smk_host; /* network address */ + struct in6_addr smk_mask; /* network mask */ + int smk_masks; /* mask size */ struct smack_known *smk_label; /* label */ }; +#endif /* CONFIG_IPV6 */ +#ifdef SMACK_IPV6_PORT_LABELING /* * An entry in the table identifying ports. */ @@ -137,6 +167,7 @@ struct smk_port_label { struct smack_known *smk_in; /* inbound label */ struct smack_known *smk_out; /* outgoing label */ }; +#endif /* SMACK_IPV6_PORT_LABELING */ struct smack_onlycap { struct list_head list; @@ -170,6 +201,7 @@ enum { #define SMK_FSROOT "smackfsroot=" #define SMK_FSTRANS "smackfstransmute=" +#define SMACK_DELETE_OPTION "-DELETE" #define SMACK_CIPSO_OPTION "-CIPSO" /* @@ -252,10 +284,6 @@ struct smk_audit_info { struct smack_audit_data sad; #endif }; -/* - * These functions are in smack_lsm.c - */ -struct inode_smack *new_inode_smack(struct smack_known *); /* * These functions are in smack_access.c @@ -285,7 +313,6 @@ extern struct smack_known *smack_syslog_label; #ifdef CONFIG_SECURITY_SMACK_BRINGUP extern struct smack_known *smack_unconfined; #endif -extern struct smack_known smack_cipso_option; extern int smack_ptrace_rule; extern struct smack_known smack_known_floor; @@ -297,7 +324,10 @@ extern struct smack_known smack_known_web; extern struct mutex smack_known_lock; extern struct list_head smack_known_list; -extern struct list_head smk_netlbladdr_list; +extern struct list_head smk_net4addr_list; +#if IS_ENABLED(CONFIG_IPV6) +extern struct list_head smk_net6addr_list; +#endif /* CONFIG_IPV6 */ extern struct mutex smack_onlycap_lock; extern struct list_head smack_onlycap_list; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d962f887d3f445..cc390bccecd775 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -51,9 +51,9 @@ #define SMK_RECEIVING 1 #define SMK_SENDING 2 -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_PORT_LABELING LIST_HEAD(smk_ipv6_port_list); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif static struct kmem_cache *smack_inode_cache; int smack_enabled; @@ -2272,7 +2272,7 @@ static void smack_sk_free_security(struct sock *sk) } /** -* smack_host_label - check host based restrictions +* smack_ipv4host_label - check host based restrictions * @sip: the object end * * looks for host based access restrictions @@ -2283,30 +2283,96 @@ static void smack_sk_free_security(struct sock *sk) * * Returns the label of the far end or NULL if it's not special. 
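*
* For example, with "192.168.0.0/16 Rubble" written to /smack/netlabel
* ("Rubble" being an arbitrary example label), packets addressed to
* 192.168.1.1 are checked against the "Rubble" label.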
*/ -static struct smack_known *smack_host_label(struct sockaddr_in *sip) +static struct smack_known *smack_ipv4host_label(struct sockaddr_in *sip) { - struct smk_netlbladdr *snp; + struct smk_net4addr *snp; struct in_addr *siap = &sip->sin_addr; if (siap->s_addr == 0) return NULL; - list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list) + list_for_each_entry_rcu(snp, &smk_net4addr_list, list) + /* + * we break after finding the first match because + * the list is sorted from longest to shortest mask + * so we have found the most specific match + */ + if (snp->smk_host.s_addr == + (siap->s_addr & snp->smk_mask.s_addr)) + return snp->smk_label; + + return NULL; +} + +#if IS_ENABLED(CONFIG_IPV6) +/* + * smk_ipv6_localhost - Check for local ipv6 host address + * @sip: the address + * + * Returns boolean true if this is the localhost address + */ +static bool smk_ipv6_localhost(struct sockaddr_in6 *sip) +{ + __be16 *be16p = (__be16 *)&sip->sin6_addr; + __be32 *be32p = (__be32 *)&sip->sin6_addr; + + if (be32p[0] == 0 && be32p[1] == 0 && be32p[2] == 0 && be16p[6] == 0 && + ntohs(be16p[7]) == 1) + return true; + return false; +} + +/** +* smack_ipv6host_label - check host based restrictions +* @sip: the object end +* +* looks for host based access restrictions +* +* This version will only be appropriate for really small sets of single label +* hosts. The caller is responsible for ensuring that the RCU read lock is +* taken before calling this function. +* +* Returns the label of the far end or NULL if it's not special. +*/ +static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip) +{ + struct smk_net6addr *snp; + struct in6_addr *sap = &sip->sin6_addr; + int i; + int found = 0; + + /* + * It's local. Don't look for a host label. + */ + if (smk_ipv6_localhost(sip)) + return NULL; + + list_for_each_entry_rcu(snp, &smk_net6addr_list, list) { /* * we break after finding the first match because * the list is sorted from longest to shortest mask * so we have found the most specific match */ - if ((&snp->smk_host.sin_addr)->s_addr == - (siap->s_addr & (&snp->smk_mask)->s_addr)) { - /* we have found the special CIPSO option */ - if (snp->smk_label == &smack_cipso_option) - return NULL; - return snp->smk_label; + for (found = 1, i = 0; i < 8; i++) { + /* + * If the label is NULL the entry has + * been renounced. Ignore it. 
+ */ + if (snp->smk_label == NULL) + continue; + if ((sap->s6_addr16[i] & snp->smk_mask.s6_addr16[i]) != + snp->smk_host.s6_addr16[i]) { + found = 0; + break; + } } + if (found) + return snp->smk_label; + } return NULL; } +#endif /* CONFIG_IPV6 */ /** * smack_netlabel - Set the secattr on a socket @@ -2370,7 +2436,7 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) struct smk_audit_info ad; rcu_read_lock(); - hkp = smack_host_label(sap); + hkp = smack_ipv4host_label(sap); if (hkp != NULL) { #ifdef CONFIG_AUDIT struct lsm_network_audit net; @@ -2395,7 +2461,42 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) return smack_netlabel(sk, sk_lbl); } -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#if IS_ENABLED(CONFIG_IPV6) +/** + * smk_ipv6_check - check Smack access + * @subject: subject Smack label + * @object: object Smack label + * @address: address + * @act: the action being taken + * + * Check an IPv6 access + */ +static int smk_ipv6_check(struct smack_known *subject, + struct smack_known *object, + struct sockaddr_in6 *address, int act) +{ +#ifdef CONFIG_AUDIT + struct lsm_network_audit net; +#endif + struct smk_audit_info ad; + int rc; + +#ifdef CONFIG_AUDIT + smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); + ad.a.u.net->family = PF_INET6; + ad.a.u.net->dport = ntohs(address->sin6_port); + if (act == SMK_RECEIVING) + ad.a.u.net->v6info.saddr = address->sin6_addr; + else + ad.a.u.net->v6info.daddr = address->sin6_addr; +#endif + rc = smk_access(subject, object, MAY_WRITE, &ad); + rc = smk_bu_note("IPv6 check", subject, object, MAY_WRITE, rc); + return rc; +} +#endif /* CONFIG_IPV6 */ + +#ifdef SMACK_IPV6_PORT_LABELING /** * smk_ipv6_port_label - Smack port access table management * @sock: socket @@ -2479,48 +2580,43 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, int act) { - __be16 *bep; - __be32 *be32p; struct smk_port_label *spp; struct socket_smack *ssp = sk->sk_security; - struct smack_known *skp; - unsigned short port = 0; + struct smack_known *skp = NULL; + unsigned short port; struct smack_known *object; - struct smk_audit_info ad; - int rc; -#ifdef CONFIG_AUDIT - struct lsm_network_audit net; -#endif if (act == SMK_RECEIVING) { - skp = smack_net_ambient; + skp = smack_ipv6host_label(address); object = ssp->smk_in; } else { skp = ssp->smk_out; - object = smack_net_ambient; + object = smack_ipv6host_label(address); } /* - * Get the IP address and port from the address. + * The other end is a single label host. */ - port = ntohs(address->sin6_port); - bep = (__be16 *)(&address->sin6_addr); - be32p = (__be32 *)(&address->sin6_addr); + if (skp != NULL && object != NULL) + return smk_ipv6_check(skp, object, address, act); + if (skp == NULL) + skp = smack_net_ambient; + if (object == NULL) + object = smack_net_ambient; /* * It's remote, so port lookup does no good. */ - if (be32p[0] || be32p[1] || be32p[2] || bep[6] || ntohs(bep[7]) != 1) - goto auditout; + if (!smk_ipv6_localhost(address)) + return smk_ipv6_check(skp, object, address, act); /* * It's local so the send check has to have passed. 
*/ - if (act == SMK_RECEIVING) { - skp = &smack_known_web; - goto auditout; - } + if (act == SMK_RECEIVING) + return 0; + port = ntohs(address->sin6_port); list_for_each_entry(spp, &smk_ipv6_port_list, list) { if (spp->smk_port != port) continue; @@ -2530,22 +2626,9 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, break; } -auditout: - -#ifdef CONFIG_AUDIT - smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); - ad.a.u.net->family = sk->sk_family; - ad.a.u.net->dport = port; - if (act == SMK_RECEIVING) - ad.a.u.net->v6info.saddr = address->sin6_addr; - else - ad.a.u.net->v6info.daddr = address->sin6_addr; -#endif - rc = smk_access(skp, object, MAY_WRITE, &ad); - rc = smk_bu_note("IPv6 port check", skp, object, MAY_WRITE, rc); - return rc; + return smk_ipv6_check(skp, object, address, act); } -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ /** * smack_inode_setsecurity - set smack xattrs @@ -2606,10 +2689,10 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name, } else return -EOPNOTSUPP; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_PORT_LABELING if (sock->sk->sk_family == PF_INET6) smk_ipv6_port_label(sock, NULL); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif return 0; } @@ -2651,7 +2734,7 @@ static int smack_socket_post_create(struct socket *sock, int family, return smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET); } -#ifndef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_PORT_LABELING /** * smack_socket_bind - record port binding information. * @sock: the socket @@ -2665,14 +2748,11 @@ static int smack_socket_post_create(struct socket *sock, int family, static int smack_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { -#if IS_ENABLED(CONFIG_IPV6) if (sock->sk != NULL && sock->sk->sk_family == PF_INET6) smk_ipv6_port_label(sock, address); -#endif - return 0; } -#endif /* !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ /** * smack_socket_connect - connect access check @@ -2688,6 +2768,13 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, int addrlen) { int rc = 0; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *sip = (struct sockaddr_in6 *)sap; +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + struct smack_known *rsp; + struct socket_smack *ssp = sock->sk->sk_security; +#endif if (sock->sk == NULL) return 0; @@ -2701,10 +2788,15 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, case PF_INET6: if (addrlen < sizeof(struct sockaddr_in6)) return -EINVAL; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) - rc = smk_ipv6_port_check(sock->sk, (struct sockaddr_in6 *)sap, +#ifdef SMACK_IPV6_SECMARK_LABELING + rsp = smack_ipv6host_label(sip); + if (rsp != NULL) + rc = smk_ipv6_check(ssp->smk_out, rsp, sip, SMK_CONNECTING); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif +#ifdef SMACK_IPV6_PORT_LABELING + rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); +#endif break; } return rc; @@ -3590,9 +3682,13 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { struct sockaddr_in *sip = (struct sockaddr_in *) msg->msg_name; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sap = (struct sockaddr_in6 *) msg->msg_name; -#endif /* CONFIG_IPV6 && 
!CONFIG_SECURITY_SMACK_NETFILTER */ +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + struct socket_smack *ssp = sock->sk->sk_security; + struct smack_known *rsp; +#endif int rc = 0; /* @@ -3606,9 +3702,15 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, rc = smack_netlabel_send(sock->sk, sip); break; case AF_INET6: -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_SECMARK_LABELING + rsp = smack_ipv6host_label(sap); + if (rsp != NULL) + rc = smk_ipv6_check(ssp->smk_out, rsp, sap, + SMK_CONNECTING); +#endif +#ifdef SMACK_IPV6_PORT_LABELING rc = smk_ipv6_port_check(sock->sk, sap, SMK_SENDING); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif break; } return rc; @@ -3822,10 +3924,12 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) proto = smk_skb_to_addr_ipv6(skb, &sadd); if (proto != IPPROTO_UDP && proto != IPPROTO_TCP) break; -#ifdef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_SECMARK_LABELING if (skb && skb->secmark != 0) skp = smack_from_secid(skb->secmark); else + skp = smack_ipv6host_label(&sadd); + if (skp == NULL) skp = smack_net_ambient; #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); @@ -3836,9 +3940,10 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad); rc = smk_bu_note("IPv6 delivery", skp, ssp->smk_in, MAY_WRITE, rc); -#else /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_SECMARK_LABELING */ +#ifdef SMACK_IPV6_PORT_LABELING rc = smk_ipv6_port_check(sk, &sadd, SMK_RECEIVING); -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ break; #endif /* CONFIG_IPV6 */ } @@ -3936,13 +4041,11 @@ static int smack_socket_getpeersec_dgram(struct socket *sock, } netlbl_secattr_destroy(&secattr); break; -#if IS_ENABLED(CONFIG_IPV6) case PF_INET6: -#ifdef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_SECMARK_LABELING s = skb->secmark; -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif break; -#endif /* CONFIG_IPV6 */ } *secid = s; if (s == 0) @@ -4065,7 +4168,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, hdr = ip_hdr(skb); addr.sin_addr.s_addr = hdr->saddr; rcu_read_lock(); - hskp = smack_host_label(&addr); + hskp = smack_ipv4host_label(&addr); rcu_read_unlock(); if (hskp == NULL) @@ -4517,9 +4620,9 @@ struct security_hook_list smack_hooks[] = { LSM_HOOK_INIT(unix_may_send, smack_unix_may_send), LSM_HOOK_INIT(socket_post_create, smack_socket_post_create), -#ifndef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_PORT_LABELING LSM_HOOK_INIT(socket_bind, smack_socket_bind), -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif LSM_HOOK_INIT(socket_connect, smack_socket_connect), LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg), LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb), @@ -4614,7 +4717,16 @@ static __init int smack_init(void) return -ENOMEM; } - printk(KERN_INFO "Smack: Initializing.\n"); + pr_info("Smack: Initializing.\n"); +#ifdef CONFIG_SECURITY_SMACK_NETFILTER + pr_info("Smack: Netfilter enabled.\n"); +#endif +#ifdef SMACK_IPV6_PORT_LABELING + pr_info("Smack: IPv6 port labeling enabled.\n"); +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + pr_info("Smack: IPv6 Netfilter enabled.\n"); +#endif /* * Set the security state for the initial task. 
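Before the smackfs changes below, a hedged usage sketch of the new
interface: with smackfs mounted on /sys/fs/smackfs, a process holding
CAP_MAC_ADMIN could load a single-label IPv6 host rule roughly as
follows. The full eight-group address form is required (the "::"
shortcut is not accepted), and "Rubble" is an arbitrary example label.

    /* Sketch only: load one IPv6 single-label host rule via smackfs. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            static const char rule[] = "2001:db8:0:0:0:0:0:0/64 Rubble\n";
            int fd = open("/sys/fs/smackfs/ipv6host", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, rule, strlen(rule)) != (ssize_t)strlen(rule)) {
                    perror("write");
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }

The smackfs hunks that implement this write path follow.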
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 81a2888a990863..11b752b366eabe 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -29,6 +29,7 @@ #include #include "smack.h" +#define BEBITS (sizeof(__be32) * 8) /* * smackfs pseudo filesystem. */ @@ -40,7 +41,7 @@ enum smk_inos { SMK_DOI = 5, /* CIPSO DOI */ SMK_DIRECT = 6, /* CIPSO level indicating direct label */ SMK_AMBIENT = 7, /* internet ambient label */ - SMK_NETLBLADDR = 8, /* single label hosts */ + SMK_NET4ADDR = 8, /* single label hosts */ SMK_ONLYCAP = 9, /* the only "capable" label */ SMK_LOGGING = 10, /* logging */ SMK_LOAD_SELF = 11, /* task specific rules */ @@ -57,6 +58,9 @@ enum smk_inos { #ifdef CONFIG_SECURITY_SMACK_BRINGUP SMK_UNCONFINED = 22, /* define an unconfined label */ #endif +#if IS_ENABLED(CONFIG_IPV6) + SMK_NET6ADDR = 23, /* single label IPv6 hosts */ +#endif /* CONFIG_IPV6 */ }; /* @@ -64,7 +68,10 @@ enum smk_inos { */ static DEFINE_MUTEX(smack_cipso_lock); static DEFINE_MUTEX(smack_ambient_lock); -static DEFINE_MUTEX(smk_netlbladdr_lock); +static DEFINE_MUTEX(smk_net4addr_lock); +#if IS_ENABLED(CONFIG_IPV6) +static DEFINE_MUTEX(smk_net6addr_lock); +#endif /* CONFIG_IPV6 */ /* * This is the "ambient" label for network traffic. @@ -118,7 +125,10 @@ int smack_ptrace_rule = SMACK_PTRACE_DEFAULT; * can write to the specified label. */ -LIST_HEAD(smk_netlbladdr_list); +LIST_HEAD(smk_net4addr_list); +#if IS_ENABLED(CONFIG_IPV6) +LIST_HEAD(smk_net6addr_list); +#endif /* CONFIG_IPV6 */ /* * Rule lists are maintained for each label. @@ -140,11 +150,6 @@ struct smack_parsed_rule { static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT; -struct smack_known smack_cipso_option = { - .smk_known = SMACK_CIPSO_OPTION, - .smk_secid = 0, -}; - /* * Values for parsing cipso rules * SMK_DIGITLEN: Length of a digit field in a rule. 
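The next hunks convert the IPv4 host list to struct smk_net4addr and
keep it ordered from longest to shortest mask, so the first match
during lookup is the most specific one. A minimal standalone sketch of
that invariant, using made-up addresses and labels and plain arrays
instead of the kernel's RCU lists:

    #include <stdint.h>
    #include <stdio.h>

    struct host_rule {
            uint32_t host;          /* host byte order, for simplicity */
            uint32_t mask;
            const char *label;
    };

    /* Ordered longest mask first, as smk_net4addr_insert() maintains. */
    static const struct host_rule rules[] = {
            { 0xc0a80100, 0xffffff00, "Pebble" },   /* 192.168.1.0/24 */
            { 0xc0a80000, 0xffff0000, "Rubble" },   /* 192.168.0.0/16 */
    };

    static const char *lookup(uint32_t addr)
    {
            unsigned int i;

            for (i = 0; i < sizeof(rules) / sizeof(rules[0]); i++)
                    if ((addr & rules[i].mask) == rules[i].host)
                            return rules[i].label;  /* first hit wins */
            return NULL;
    }

    int main(void)
    {
            /* 192.168.1.7 matches the /24 entry before the /16 one. */
            printf("%s\n", lookup(0xc0a80107));     /* prints "Pebble" */
            return 0;
    }

Reversing the two entries would let the broader /16 shadow the /24;
the sorted insertion below is what prevents that.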
@@ -1047,92 +1052,90 @@ static const struct file_operations smk_cipso2_ops = { * Seq_file read operations for /smack/netlabel */ -static void *netlbladdr_seq_start(struct seq_file *s, loff_t *pos) +static void *net4addr_seq_start(struct seq_file *s, loff_t *pos) { - return smk_seq_start(s, pos, &smk_netlbladdr_list); + return smk_seq_start(s, pos, &smk_net4addr_list); } -static void *netlbladdr_seq_next(struct seq_file *s, void *v, loff_t *pos) +static void *net4addr_seq_next(struct seq_file *s, void *v, loff_t *pos) { - return smk_seq_next(s, v, pos, &smk_netlbladdr_list); + return smk_seq_next(s, v, pos, &smk_net4addr_list); } -#define BEBITS (sizeof(__be32) * 8) /* * Print host/label pairs */ -static int netlbladdr_seq_show(struct seq_file *s, void *v) +static int net4addr_seq_show(struct seq_file *s, void *v) { struct list_head *list = v; - struct smk_netlbladdr *skp = - list_entry_rcu(list, struct smk_netlbladdr, list); - unsigned char *hp = (char *) &skp->smk_host.sin_addr.s_addr; - int maskn; - u32 temp_mask = be32_to_cpu(skp->smk_mask.s_addr); - - for (maskn = 0; temp_mask; temp_mask <<= 1, maskn++); + struct smk_net4addr *skp = + list_entry_rcu(list, struct smk_net4addr, list); + char *kp = SMACK_CIPSO_OPTION; - seq_printf(s, "%u.%u.%u.%u/%d %s\n", - hp[0], hp[1], hp[2], hp[3], maskn, skp->smk_label->smk_known); + if (skp->smk_label != NULL) + kp = skp->smk_label->smk_known; + seq_printf(s, "%pI4/%d %s\n", &skp->smk_host.s_addr, + skp->smk_masks, kp); return 0; } -static const struct seq_operations netlbladdr_seq_ops = { - .start = netlbladdr_seq_start, - .next = netlbladdr_seq_next, - .show = netlbladdr_seq_show, +static const struct seq_operations net4addr_seq_ops = { + .start = net4addr_seq_start, + .next = net4addr_seq_next, + .show = net4addr_seq_show, .stop = smk_seq_stop, }; /** - * smk_open_netlbladdr - open() for /smack/netlabel + * smk_open_net4addr - open() for /smack/netlabel * @inode: inode structure representing file * @file: "netlabel" file pointer * - * Connect our netlbladdr_seq_* operations with /smack/netlabel + * Connect our net4addr_seq_* operations with /smack/netlabel * file_operations */ -static int smk_open_netlbladdr(struct inode *inode, struct file *file) +static int smk_open_net4addr(struct inode *inode, struct file *file) { - return seq_open(file, &netlbladdr_seq_ops); + return seq_open(file, &net4addr_seq_ops); } /** - * smk_netlbladdr_insert + * smk_net4addr_insert * @new : netlabel to insert * - * This helper insert netlabel in the smack_netlbladdrs list + * This helper insert netlabel in the smack_net4addrs list * sorted by netmask length (longest to smallest) - * locked by &smk_netlbladdr_lock in smk_write_netlbladdr + * locked by &smk_net4addr_lock in smk_write_net4addr * */ -static void smk_netlbladdr_insert(struct smk_netlbladdr *new) +static void smk_net4addr_insert(struct smk_net4addr *new) { - struct smk_netlbladdr *m, *m_next; + struct smk_net4addr *m; + struct smk_net4addr *m_next; - if (list_empty(&smk_netlbladdr_list)) { - list_add_rcu(&new->list, &smk_netlbladdr_list); + if (list_empty(&smk_net4addr_list)) { + list_add_rcu(&new->list, &smk_net4addr_list); return; } - m = list_entry_rcu(smk_netlbladdr_list.next, - struct smk_netlbladdr, list); + m = list_entry_rcu(smk_net4addr_list.next, + struct smk_net4addr, list); /* the comparison '>' is a bit hacky, but works */ - if (new->smk_mask.s_addr > m->smk_mask.s_addr) { - list_add_rcu(&new->list, &smk_netlbladdr_list); + if (new->smk_masks > m->smk_masks) { + list_add_rcu(&new->list, 
&smk_net4addr_list); return; } - list_for_each_entry_rcu(m, &smk_netlbladdr_list, list) { - if (list_is_last(&m->list, &smk_netlbladdr_list)) { + list_for_each_entry_rcu(m, &smk_net4addr_list, list) { + if (list_is_last(&m->list, &smk_net4addr_list)) { list_add_rcu(&new->list, &m->list); return; } m_next = list_entry_rcu(m->list.next, - struct smk_netlbladdr, list); - if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) { + struct smk_net4addr, list); + if (new->smk_masks > m_next->smk_masks) { list_add_rcu(&new->list, &m->list); return; } @@ -1141,28 +1144,29 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) /** - * smk_write_netlbladdr - write() for /smack/netlabel + * smk_write_net4addr - write() for /smack/netlabel * @file: file pointer, not actually used * @buf: where to get the data from * @count: bytes sent * @ppos: where to start * - * Accepts only one netlbladdr per write call. + * Accepts only one net4addr per write call. * Returns number of bytes written or error code, as appropriate */ -static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, +static ssize_t smk_write_net4addr(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct smk_netlbladdr *snp; + struct smk_net4addr *snp; struct sockaddr_in newname; char *smack; - struct smack_known *skp; + struct smack_known *skp = NULL; char *data; char *host = (char *)&newname.sin_addr.s_addr; int rc; struct netlbl_audit audit_info; struct in_addr mask; unsigned int m; + unsigned int masks; int found; u32 mask_bits = (1<<31); __be32 nsa; @@ -1200,7 +1204,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, data[count] = '\0'; rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd/%u %s", - &host[0], &host[1], &host[2], &host[3], &m, smack); + &host[0], &host[1], &host[2], &host[3], &masks, smack); if (rc != 6) { rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd %s", &host[0], &host[1], &host[2], &host[3], smack); @@ -1209,8 +1213,9 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, goto free_out; } m = BEBITS; + masks = 32; } - if (m > BEBITS) { + if (masks > BEBITS) { rc = -EINVAL; goto free_out; } @@ -1225,16 +1230,16 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, goto free_out; } } else { - /* check known options */ - if (strcmp(smack, smack_cipso_option.smk_known) == 0) - skp = &smack_cipso_option; - else { + /* + * Only the -CIPSO option is supported for IPv4 + */ + if (strcmp(smack, SMACK_CIPSO_OPTION) != 0) { rc = -EINVAL; goto free_out; } } - for (temp_mask = 0; m > 0; m--) { + for (m = masks, temp_mask = 0; m > 0; m--) { temp_mask |= mask_bits; mask_bits >>= 1; } @@ -1245,14 +1250,13 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, * Only allow one writer at a time. Writes should be * quite rare and small in any case. 
*/ - mutex_lock(&smk_netlbladdr_lock); + mutex_lock(&smk_net4addr_lock); nsa = newname.sin_addr.s_addr; /* try to find if the prefix is already in the list */ found = 0; - list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list) { - if (snp->smk_host.sin_addr.s_addr == nsa && - snp->smk_mask.s_addr == mask.s_addr) { + list_for_each_entry_rcu(snp, &smk_net4addr_list, list) { + if (snp->smk_host.s_addr == nsa && snp->smk_masks == masks) { found = 1; break; } @@ -1265,17 +1269,20 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, rc = -ENOMEM; else { rc = 0; - snp->smk_host.sin_addr.s_addr = newname.sin_addr.s_addr; + snp->smk_host.s_addr = newname.sin_addr.s_addr; snp->smk_mask.s_addr = mask.s_addr; snp->smk_label = skp; - smk_netlbladdr_insert(snp); + snp->smk_masks = masks; + smk_net4addr_insert(snp); } } else { - /* we delete the unlabeled entry, only if the previous label - * wasn't the special CIPSO option */ - if (snp->smk_label != &smack_cipso_option) + /* + * Delete the unlabeled entry, only if the previous label + * wasn't the special CIPSO option + */ + if (snp->smk_label != NULL) rc = netlbl_cfg_unlbl_static_del(&init_net, NULL, - &snp->smk_host.sin_addr, &snp->smk_mask, + &snp->smk_host, &snp->smk_mask, PF_INET, &audit_info); else rc = 0; @@ -1287,15 +1294,279 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, * this host so that incoming packets get labeled. * but only if we didn't get the special CIPSO option */ - if (rc == 0 && skp != &smack_cipso_option) + if (rc == 0 && skp != NULL) rc = netlbl_cfg_unlbl_static_add(&init_net, NULL, - &snp->smk_host.sin_addr, &snp->smk_mask, PF_INET, + &snp->smk_host, &snp->smk_mask, PF_INET, snp->smk_label->smk_secid, &audit_info); if (rc == 0) rc = count; - mutex_unlock(&smk_netlbladdr_lock); + mutex_unlock(&smk_net4addr_lock); + +free_out: + kfree(smack); +free_data_out: + kfree(data); + + return rc; +} + +static const struct file_operations smk_net4addr_ops = { + .open = smk_open_net4addr, + .read = seq_read, + .llseek = seq_lseek, + .write = smk_write_net4addr, + .release = seq_release, +}; + +#if IS_ENABLED(CONFIG_IPV6) +/* + * Seq_file read operations for /smack/netlabel6 + */ + +static void *net6addr_seq_start(struct seq_file *s, loff_t *pos) +{ + return smk_seq_start(s, pos, &smk_net6addr_list); +} + +static void *net6addr_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + return smk_seq_next(s, v, pos, &smk_net6addr_list); +} + +/* + * Print host/label pairs + */ +static int net6addr_seq_show(struct seq_file *s, void *v) +{ + struct list_head *list = v; + struct smk_net6addr *skp = + list_entry(list, struct smk_net6addr, list); + + if (skp->smk_label != NULL) + seq_printf(s, "%pI6/%d %s\n", &skp->smk_host, skp->smk_masks, + skp->smk_label->smk_known); + + return 0; +} + +static const struct seq_operations net6addr_seq_ops = { + .start = net6addr_seq_start, + .next = net6addr_seq_next, + .show = net6addr_seq_show, + .stop = smk_seq_stop, +}; + +/** + * smk_open_net6addr - open() for /smack/netlabel + * @inode: inode structure representing file + * @file: "netlabel" file pointer + * + * Connect our net6addr_seq_* operations with /smack/netlabel + * file_operations + */ +static int smk_open_net6addr(struct inode *inode, struct file *file) +{ + return seq_open(file, &net6addr_seq_ops); +} + +/** + * smk_net6addr_insert + * @new : entry to insert + * + * This inserts an entry in the smack_net6addrs list + * sorted by netmask length (longest to smallest) + * locked by 
&smk_net6addr_lock in smk_write_net6addr + * + */ +static void smk_net6addr_insert(struct smk_net6addr *new) +{ + struct smk_net6addr *m_next; + struct smk_net6addr *m; + + if (list_empty(&smk_net6addr_list)) { + list_add_rcu(&new->list, &smk_net6addr_list); + return; + } + + m = list_entry_rcu(smk_net6addr_list.next, + struct smk_net6addr, list); + + if (new->smk_masks > m->smk_masks) { + list_add_rcu(&new->list, &smk_net6addr_list); + return; + } + + list_for_each_entry_rcu(m, &smk_net6addr_list, list) { + if (list_is_last(&m->list, &smk_net6addr_list)) { + list_add_rcu(&new->list, &m->list); + return; + } + m_next = list_entry_rcu(m->list.next, + struct smk_net6addr, list); + if (new->smk_masks > m_next->smk_masks) { + list_add_rcu(&new->list, &m->list); + return; + } + } +} + + +/** + * smk_write_net6addr - write() for /smack/netlabel + * @file: file pointer, not actually used + * @buf: where to get the data from + * @count: bytes sent + * @ppos: where to start + * + * Accepts only one net6addr per write call. + * Returns number of bytes written or error code, as appropriate + */ +static ssize_t smk_write_net6addr(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct smk_net6addr *snp; + struct in6_addr newname; + struct in6_addr fullmask; + struct smack_known *skp = NULL; + char *smack; + char *data; + int rc = 0; + int found = 0; + int i; + unsigned int scanned[8]; + unsigned int m; + unsigned int mask = 128; + + /* + * Must have privilege. + * No partial writes. + * Enough data must be present. + * "