From eebe3685701bcd4b437f2ddd2142888b1fb39aeb Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 24 Sep 2020 09:40:53 -0400 Subject: [PATCH 01/86] fbdev, newport_con: Move FONT_EXTRA_WORDS macros into linux/font.h commit bb0890b4cd7f8203e3aa99c6d0f062d6acdaad27 upstream. drivers/video/console/newport_con.c is borrowing FONT_EXTRA_WORDS macros from drivers/video/fbdev/core/fbcon.h. To keep things simple, move all definitions into . Since newport_con now uses four extra words, initialize the fourth word in newport_set_font() properly. Cc: stable@vger.kernel.org Signed-off-by: Peilin Ye Reviewed-by: Greg Kroah-Hartman Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/7fb8bc9b0abc676ada6b7ac0e0bd443499357267.1600953813.git.yepeilin.cs@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/video/console/newport_con.c | 7 +------ drivers/video/fbdev/core/fbcon.h | 7 ------- drivers/video/fbdev/core/fbcon_rotate.c | 1 + drivers/video/fbdev/core/tileblit.c | 1 + include/linux/font.h | 8 ++++++++ 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 2d2ee17052e837..f45de374f165f6 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -36,12 +36,6 @@ #define FONT_DATA ((unsigned char *)font_vga_8x16.data) -/* borrowed from fbcon.c */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) -#define FONT_EXTRA_WORDS 3 - static unsigned char *font_data[MAX_NR_CONSOLES]; static struct newport_regs *npregs; @@ -523,6 +517,7 @@ static int newport_set_font(int unit, struct console_font *op) FNTSIZE(new_data) = size; FNTCHARCNT(new_data) = op->charcount; REFCOUNT(new_data) = 0; /* usage counter */ + FNTSUM(new_data) = 0; p = new_data; for (i = 0; i < op->charcount; i++) { diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 78bb14c03643ee..9315b360c89813 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -152,13 +152,6 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) #define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) -/* Font */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) -#define FNTSUM(fd) (((int *)(fd))[-4]) -#define FONT_EXTRA_WORDS 4 - /* * Scroll Method */ diff --git a/drivers/video/fbdev/core/fbcon_rotate.c b/drivers/video/fbdev/core/fbcon_rotate.c index c0d445294aa7c3..ac72d4f85f7d01 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.c +++ b/drivers/video/fbdev/core/fbcon_rotate.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "fbcon.h" #include "fbcon_rotate.h" diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index eb664dbf96f66e..adff8d6ffe6f9f 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "fbcon.h" diff --git a/include/linux/font.h b/include/linux/font.h index 51b91c8b69d582..40ed008d7dad87 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -59,4 +59,12 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Max. length for the name of a predefined font */ #define MAX_FONT_NAME 32 +/* Extra word getters */ +#define REFCOUNT(fd) (((int *)(fd))[-1]) +#define FNTSIZE(fd) (((int *)(fd))[-2]) +#define FNTCHARCNT(fd) (((int *)(fd))[-3]) +#define FNTSUM(fd) (((int *)(fd))[-4]) + +#define FONT_EXTRA_WORDS 4 + #endif /* _VIDEO_FONT_H */ From f51ec3fd71284531205981858b192970ec46770e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 24 Sep 2020 09:42:22 -0400 Subject: [PATCH 02/86] Fonts: Support FONT_EXTRA_WORDS macros for built-in fonts commit 6735b4632def0640dbdf4eb9f99816aca18c4f16 upstream. syzbot has reported an issue in the framebuffer layer, where a malicious user may overflow our built-in font data buffers. In order to perform a reliable range check, subsystems need to know `FONTDATAMAX` for each built-in font. Unfortunately, our font descriptor, `struct console_font` does not contain `FONTDATAMAX`, and is part of the UAPI, making it infeasible to modify it. For user-provided fonts, the framebuffer layer resolves this issue by reserving four extra words at the beginning of data buffers. Later, whenever a function needs to access them, it simply uses the following macros: Recently we have gathered all the above macros to . Let us do the same thing for built-in fonts, prepend four extra words (including `FONTDATAMAX`) to their data buffers, so that subsystems can use these macros for all fonts, no matter built-in or user-provided. This patch depends on patch "fbdev, newport_con: Move FONT_EXTRA_WORDS macros into linux/font.h". Cc: stable@vger.kernel.org Link: https://syzkaller.appspot.com/bug?id=08b8be45afea11888776f897895aef9ad1c3ecfd Signed-off-by: Peilin Ye Reviewed-by: Greg Kroah-Hartman Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/ef18af00c35fb3cc826048a5f70924ed6ddce95b.1600953813.git.yepeilin.cs@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/font.h | 5 +++++ lib/fonts/font_10x18.c | 9 ++++----- lib/fonts/font_6x10.c | 9 +++++---- lib/fonts/font_6x11.c | 9 ++++----- lib/fonts/font_7x14.c | 9 ++++----- lib/fonts/font_8x16.c | 9 ++++----- lib/fonts/font_8x8.c | 9 ++++----- lib/fonts/font_acorn_8x8.c | 9 ++++++--- lib/fonts/font_mini_4x6.c | 8 ++++---- lib/fonts/font_pearl_8x8.c | 9 ++++----- lib/fonts/font_sun12x22.c | 9 ++++----- lib/fonts/font_sun8x16.c | 7 ++++--- lib/fonts/font_ter16x32.c | 9 ++++----- 13 files changed, 56 insertions(+), 54 deletions(-) diff --git a/include/linux/font.h b/include/linux/font.h index 40ed008d7dad87..59faa80f586df9 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -67,4 +67,9 @@ extern const struct font_desc *get_default_font(int xres, int yres, #define FONT_EXTRA_WORDS 4 +struct font_data { + unsigned int extra[FONT_EXTRA_WORDS]; + const unsigned char data[]; +} __packed; + #endif /* _VIDEO_FONT_H */ diff --git a/lib/fonts/font_10x18.c b/lib/fonts/font_10x18.c index 532f0ff89a962d..0e2deac97da0dc 100644 --- a/lib/fonts/font_10x18.c +++ b/lib/fonts/font_10x18.c @@ -8,8 +8,8 @@ #define FONTDATAMAX 9216 -static const unsigned char fontdata_10x18[FONTDATAMAX] = { - +static struct font_data fontdata_10x18 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, 0x00, /* 0000000000 */ 0x00, 0x00, /* 0000000000 */ @@ -5129,8 +5129,7 @@ static const unsigned char fontdata_10x18[FONTDATAMAX] = { 0x00, 0x00, /* 0000000000 */ 0x00, 0x00, /* 0000000000 */ 0x00, 0x00, /* 0000000000 */ - -}; +} }; const struct font_desc font_10x18 = { @@ -5138,7 +5137,7 @@ const struct font_desc font_10x18 = { .name = "10x18", .width = 10, .height = 18, - .data = fontdata_10x18, + .data = fontdata_10x18.data, #ifdef __sparc__ .pref = 5, #else diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c index 09b2cc03435b93..87da8acd07db0c 100644 --- a/lib/fonts/font_6x10.c +++ b/lib/fonts/font_6x10.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include -static const unsigned char fontdata_6x10[] = { +#define FONTDATAMAX 2560 +static struct font_data fontdata_6x10 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ @@ -3074,14 +3076,13 @@ static const unsigned char fontdata_6x10[] = { 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ - -}; +} }; const struct font_desc font_6x10 = { .idx = FONT6x10_IDX, .name = "6x10", .width = 6, .height = 10, - .data = fontdata_6x10, + .data = fontdata_6x10.data, .pref = 0, }; diff --git a/lib/fonts/font_6x11.c b/lib/fonts/font_6x11.c index d7136c33f1f018..5e975dfa10a53a 100644 --- a/lib/fonts/font_6x11.c +++ b/lib/fonts/font_6x11.c @@ -9,8 +9,8 @@ #define FONTDATAMAX (11*256) -static const unsigned char fontdata_6x11[FONTDATAMAX] = { - +static struct font_data fontdata_6x11 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ @@ -3338,8 +3338,7 @@ static const unsigned char fontdata_6x11[FONTDATAMAX] = { 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ - -}; +} }; const struct font_desc font_vga_6x11 = { @@ -3347,7 +3346,7 @@ const struct font_desc font_vga_6x11 = { .name = "ProFont6x11", .width = 6, .height = 11, - .data = fontdata_6x11, + .data = fontdata_6x11.data, /* Try avoiding this font if possible unless on MAC */ .pref = -2000, }; diff --git a/lib/fonts/font_7x14.c b/lib/fonts/font_7x14.c index 89752d0b23e8b0..86d298f3850588 100644 --- a/lib/fonts/font_7x14.c +++ b/lib/fonts/font_7x14.c @@ -8,8 +8,8 @@ #define FONTDATAMAX 3584 -static const unsigned char fontdata_7x14[FONTDATAMAX] = { - +static struct font_data fontdata_7x14 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 0000000 */ 0x00, /* 0000000 */ @@ -4105,8 +4105,7 @@ static const unsigned char fontdata_7x14[FONTDATAMAX] = { 0x00, /* 0000000 */ 0x00, /* 0000000 */ 0x00, /* 0000000 */ - -}; +} }; const struct font_desc font_7x14 = { @@ -4114,6 +4113,6 @@ const struct font_desc font_7x14 = { .name = "7x14", .width = 7, .height = 14, - .data = fontdata_7x14, + .data = fontdata_7x14.data, .pref = 0, }; diff --git a/lib/fonts/font_8x16.c b/lib/fonts/font_8x16.c index b7ab1f5fbdb8a5..37cedd36ca5ef2 100644 --- a/lib/fonts/font_8x16.c +++ b/lib/fonts/font_8x16.c @@ -10,8 +10,8 @@ #define FONTDATAMAX 4096 -static const unsigned char fontdata_8x16[FONTDATAMAX] = { - +static struct font_data fontdata_8x16 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ @@ -4619,8 +4619,7 @@ static const unsigned char fontdata_8x16[FONTDATAMAX] = { 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ - -}; +} }; const struct font_desc font_vga_8x16 = { @@ -4628,7 +4627,7 @@ const struct font_desc font_vga_8x16 = { .name = "VGA8x16", .width = 8, .height = 16, - .data = fontdata_8x16, + .data = fontdata_8x16.data, .pref = 0, }; EXPORT_SYMBOL(font_vga_8x16); diff --git a/lib/fonts/font_8x8.c b/lib/fonts/font_8x8.c index 2328ebc8bab5d7..8ab695538395df 100644 --- a/lib/fonts/font_8x8.c +++ b/lib/fonts/font_8x8.c @@ -9,8 +9,8 @@ #define FONTDATAMAX 2048 -static const unsigned char fontdata_8x8[FONTDATAMAX] = { - +static struct font_data fontdata_8x8 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ @@ -2570,8 +2570,7 @@ static const unsigned char fontdata_8x8[FONTDATAMAX] = { 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ - -}; +} }; const struct font_desc font_vga_8x8 = { @@ -2579,6 +2578,6 @@ const struct font_desc font_vga_8x8 = { .name = "VGA8x8", .width = 8, .height = 8, - .data = fontdata_8x8, + .data = fontdata_8x8.data, .pref = 0, }; diff --git a/lib/fonts/font_acorn_8x8.c b/lib/fonts/font_acorn_8x8.c index 0ff0e85d4481ba..069b3e80c43449 100644 --- a/lib/fonts/font_acorn_8x8.c +++ b/lib/fonts/font_acorn_8x8.c @@ -3,7 +3,10 @@ #include -static const unsigned char acorndata_8x8[] = { +#define FONTDATAMAX 2048 + +static struct font_data acorndata_8x8 = { +{ 0, 0, FONTDATAMAX, 0 }, { /* 00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^@ */ /* 01 */ 0x7e, 0x81, 0xa5, 0x81, 0xbd, 0x99, 0x81, 0x7e, /* ^A */ /* 02 */ 0x7e, 0xff, 0xbd, 0xff, 0xc3, 0xe7, 0xff, 0x7e, /* ^B */ @@ -260,14 +263,14 @@ static const unsigned char acorndata_8x8[] = { /* FD */ 0x38, 0x04, 0x18, 0x20, 0x3c, 0x00, 0x00, 0x00, /* FE */ 0x00, 0x00, 0x3c, 0x3c, 0x3c, 0x3c, 0x00, 0x00, /* FF */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 -}; +} }; const struct font_desc font_acorn_8x8 = { .idx = ACORN8x8_IDX, .name = "Acorn8x8", .width = 8, .height = 8, - .data = acorndata_8x8, + .data = acorndata_8x8.data, #ifdef CONFIG_ARCH_ACORN .pref = 20, #else diff --git a/lib/fonts/font_mini_4x6.c b/lib/fonts/font_mini_4x6.c index 838caa1cfef70d..1449876c6a2702 100644 --- a/lib/fonts/font_mini_4x6.c +++ b/lib/fonts/font_mini_4x6.c @@ -43,8 +43,8 @@ __END__; #define FONTDATAMAX 1536 -static const unsigned char fontdata_mini_4x6[FONTDATAMAX] = { - +static struct font_data fontdata_mini_4x6 = { + { 0, 0, FONTDATAMAX, 0 }, { /*{*/ /* Char 0: ' ' */ 0xee, /*= [*** ] */ @@ -2145,14 +2145,14 @@ static const unsigned char fontdata_mini_4x6[FONTDATAMAX] = { 0xee, /*= [*** ] */ 0x00, /*= [ ] */ /*}*/ -}; +} }; const struct font_desc font_mini_4x6 = { .idx = MINI4x6_IDX, .name = "MINI4x6", .width = 4, .height = 6, - .data = fontdata_mini_4x6, + .data = fontdata_mini_4x6.data, .pref = 3, }; diff --git a/lib/fonts/font_pearl_8x8.c b/lib/fonts/font_pearl_8x8.c index b15d3c342c5bbc..32d65551e7ed29 100644 --- a/lib/fonts/font_pearl_8x8.c +++ b/lib/fonts/font_pearl_8x8.c @@ -14,8 +14,8 @@ #define FONTDATAMAX 2048 -static const unsigned char fontdata_pearl8x8[FONTDATAMAX] = { - +static struct font_data fontdata_pearl8x8 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ @@ -2575,14 +2575,13 @@ static const unsigned char fontdata_pearl8x8[FONTDATAMAX] = { 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ - -}; +} }; const struct font_desc font_pearl_8x8 = { .idx = PEARL8x8_IDX, .name = "PEARL8x8", .width = 8, .height = 8, - .data = fontdata_pearl8x8, + .data = fontdata_pearl8x8.data, .pref = 2, }; diff --git a/lib/fonts/font_sun12x22.c b/lib/fonts/font_sun12x22.c index 955d6eee3959d7..641a6b4dca424f 100644 --- a/lib/fonts/font_sun12x22.c +++ b/lib/fonts/font_sun12x22.c @@ -3,8 +3,8 @@ #define FONTDATAMAX 11264 -static const unsigned char fontdata_sun12x22[FONTDATAMAX] = { - +static struct font_data fontdata_sun12x22 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, 0x00, /* 000000000000 */ 0x00, 0x00, /* 000000000000 */ @@ -6148,8 +6148,7 @@ static const unsigned char fontdata_sun12x22[FONTDATAMAX] = { 0x00, 0x00, /* 000000000000 */ 0x00, 0x00, /* 000000000000 */ 0x00, 0x00, /* 000000000000 */ - -}; +} }; const struct font_desc font_sun_12x22 = { @@ -6157,7 +6156,7 @@ const struct font_desc font_sun_12x22 = { .name = "SUN12x22", .width = 12, .height = 22, - .data = fontdata_sun12x22, + .data = fontdata_sun12x22.data, #ifdef __sparc__ .pref = 5, #else diff --git a/lib/fonts/font_sun8x16.c b/lib/fonts/font_sun8x16.c index 03d71e53954abd..193fe6d988e08e 100644 --- a/lib/fonts/font_sun8x16.c +++ b/lib/fonts/font_sun8x16.c @@ -3,7 +3,8 @@ #define FONTDATAMAX 4096 -static const unsigned char fontdata_sun8x16[FONTDATAMAX] = { +static struct font_data fontdata_sun8x16 = { +{ 0, 0, FONTDATAMAX, 0 }, { /* */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* */ 0x00,0x00,0x7e,0x81,0xa5,0x81,0x81,0xbd,0x99,0x81,0x81,0x7e,0x00,0x00,0x00,0x00, /* */ 0x00,0x00,0x7e,0xff,0xdb,0xff,0xff,0xc3,0xe7,0xff,0xff,0x7e,0x00,0x00,0x00,0x00, @@ -260,14 +261,14 @@ static const unsigned char fontdata_sun8x16[FONTDATAMAX] = { /* */ 0x00,0x70,0xd8,0x30,0x60,0xc8,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* */ 0x00,0x00,0x00,0x00,0x7c,0x7c,0x7c,0x7c,0x7c,0x7c,0x7c,0x00,0x00,0x00,0x00,0x00, /* */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -}; +} }; const struct font_desc font_sun_8x16 = { .idx = SUN8x16_IDX, .name = "SUN8x16", .width = 8, .height = 16, - .data = fontdata_sun8x16, + .data = fontdata_sun8x16.data, #ifdef __sparc__ .pref = 10, #else diff --git a/lib/fonts/font_ter16x32.c b/lib/fonts/font_ter16x32.c index 3f0cf1ccdf3a42..91b9c283bd9cc5 100644 --- a/lib/fonts/font_ter16x32.c +++ b/lib/fonts/font_ter16x32.c @@ -4,8 +4,8 @@ #define FONTDATAMAX 16384 -static const unsigned char fontdata_ter16x32[FONTDATAMAX] = { - +static struct font_data fontdata_ter16x32 = { + { 0, 0, FONTDATAMAX, 0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xfc, 0x7f, 0xfc, 0x70, 0x1c, 0x70, 0x1c, 0x70, 0x1c, 0x70, 0x1c, @@ -2054,8 +2054,7 @@ static const unsigned char fontdata_ter16x32[FONTDATAMAX] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 255 */ - -}; +} }; const struct font_desc font_ter_16x32 = { @@ -2063,7 +2062,7 @@ const struct font_desc font_ter_16x32 = { .name = "TER16x32", .width = 16, .height = 32, - .data = fontdata_ter16x32, + .data = fontdata_ter16x32.data, #ifdef __sparc__ .pref = 5, #else From 1b2fcd82c0ca23f6fa01298c0d7b59eb4efbaf48 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 24 Sep 2020 09:43:48 -0400 Subject: [PATCH 03/86] fbcon: Fix global-out-of-bounds read in fbcon_get_font() commit 5af08640795b2b9a940c9266c0260455377ae262 upstream. fbcon_get_font() is reading out-of-bounds. A malicious user may resize `vc->vc_font.height` to a large value, causing fbcon_get_font() to read out of `fontdata`. fbcon_get_font() handles both built-in and user-provided fonts. Fortunately, recently we have added FONT_EXTRA_WORDS support for built-in fonts, so fix it by adding range checks using FNTSIZE(). This patch depends on patch "fbdev, newport_con: Move FONT_EXTRA_WORDS macros into linux/font.h", and patch "Fonts: Support FONT_EXTRA_WORDS macros for built-in fonts". Cc: stable@vger.kernel.org Reported-and-tested-by: syzbot+29d4ed7f3bdedf2aa2fd@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=08b8be45afea11888776f897895aef9ad1c3ecfd Signed-off-by: Peilin Ye Reviewed-by: Greg Kroah-Hartman Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/b34544687a1a09d6de630659eb7a773f4953238b.1600953813.git.yepeilin.cs@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/core/fbcon.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index dc7f5c4f0607ef..4cf71ee0965a67 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2292,6 +2292,9 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font) if (font->width <= 8) { j = vc->vc_font.height; + if (font->charcount * j > FNTSIZE(fontdata)) + return -EINVAL; + for (i = 0; i < font->charcount; i++) { memcpy(data, fontdata, j); memset(data + j, 0, 32 - j); @@ -2300,6 +2303,9 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font) } } else if (font->width <= 16) { j = vc->vc_font.height * 2; + if (font->charcount * j > FNTSIZE(fontdata)) + return -EINVAL; + for (i = 0; i < font->charcount; i++) { memcpy(data, fontdata, j); memset(data + j, 0, 64 - j); @@ -2307,6 +2313,9 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font) fontdata += j; } } else if (font->width <= 24) { + if (font->charcount * (vc->vc_font.height * sizeof(u32)) > FNTSIZE(fontdata)) + return -EINVAL; + for (i = 0; i < font->charcount; i++) { for (j = 0; j < vc->vc_font.height; j++) { *data++ = fontdata[0]; @@ -2319,6 +2328,9 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font) } } else { j = vc->vc_font.height * 4; + if (font->charcount * j > FNTSIZE(fontdata)) + return -EINVAL; + for (i = 0; i < font->charcount; i++) { memcpy(data, fontdata, j); memset(data + j, 0, 128 - j); From 4f46ef7bec86d13ac89c7976d8cb41612cd179f2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 22 Sep 2020 09:29:31 +0200 Subject: [PATCH 04/86] Revert "ravb: Fixed to be able to unload modules" commit 77972b55fb9d35d4a6b0abca99abffaa4ec6a85b upstream. This reverts commit 1838d6c62f57836639bd3d83e7855e0ee4f6defc. This commit moved the ravb_mdio_init() call (and thus the of_mdiobus_register() call) from the ravb_probe() to the ravb_open() call. This causes a regression during system resume (s2idle/s2ram), as new PHY devices cannot be bound while suspended. During boot, the Micrel PHY is detected like this: Micrel KSZ9031 Gigabit PHY e6800000.ethernet-ffffffff:00: attached PHY driver [Micrel KSZ9031 Gigabit PHY] (mii_bus:phy_addr=e6800000.ethernet-ffffffff:00, irq=228) ravb e6800000.ethernet eth0: Link is Up - 1Gbps/Full - flow control off During system suspend, (A) defer_all_probes is set to true, and (B) usermodehelper_disabled is set to UMH_DISABLED, to avoid drivers being probed while suspended. A. If CONFIG_MODULES=n, phy_device_register() calling device_add() merely adds the device, but does not probe it yet, as really_probe() returns early due to defer_all_probes being set: dpm_resume+0x128/0x4f8 device_resume+0xcc/0x1b0 dpm_run_callback+0x74/0x340 ravb_resume+0x190/0x1b8 ravb_open+0x84/0x770 of_mdiobus_register+0x1e0/0x468 of_mdiobus_register_phy+0x1b8/0x250 of_mdiobus_phy_device_register+0x178/0x1e8 phy_device_register+0x114/0x1b8 device_add+0x3d4/0x798 bus_probe_device+0x98/0xa0 device_initial_probe+0x10/0x18 __device_attach+0xe4/0x140 bus_for_each_drv+0x64/0xc8 __device_attach_driver+0xb8/0xe0 driver_probe_device.part.11+0xc4/0xd8 really_probe+0x32c/0x3b8 Later, phy_attach_direct() notices no PHY driver has been bound, and falls back to the Generic PHY, leading to degraded operation: Generic PHY e6800000.ethernet-ffffffff:00: attached PHY driver [Generic PHY] (mii_bus:phy_addr=e6800000.ethernet-ffffffff:00, irq=POLL) ravb e6800000.ethernet eth0: Link is Up - 1Gbps/Full - flow control off B. If CONFIG_MODULES=y, request_module() returns early with -EBUSY due to UMH_DISABLED, and MDIO initialization fails completely: mdio_bus e6800000.ethernet-ffffffff:00: error -16 loading PHY driver module for ID 0x00221622 ravb e6800000.ethernet eth0: failed to initialize MDIO PM: dpm_run_callback(): ravb_resume+0x0/0x1b8 returns -16 PM: Device e6800000.ethernet failed to resume: error -16 Ignoring -EBUSY in phy_request_driver_module(), like was done for -ENOENT in commit 21e194425abd65b5 ("net: phy: fix issue with loading PHY driver w/o initramfs"), would makes it fall back to the Generic PHY, like in the CONFIG_MODULES=n case. Signed-off-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/renesas/ravb_main.c | 110 +++++++++++------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 907ae1359a7c1f..30cdabf64ccc1a 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1336,51 +1336,6 @@ static inline int ravb_hook_irq(unsigned int irq, irq_handler_t handler, return error; } -/* MDIO bus init function */ -static int ravb_mdio_init(struct ravb_private *priv) -{ - struct platform_device *pdev = priv->pdev; - struct device *dev = &pdev->dev; - int error; - - /* Bitbang init */ - priv->mdiobb.ops = &bb_ops; - - /* MII controller setting */ - priv->mii_bus = alloc_mdio_bitbang(&priv->mdiobb); - if (!priv->mii_bus) - return -ENOMEM; - - /* Hook up MII support for ethtool */ - priv->mii_bus->name = "ravb_mii"; - priv->mii_bus->parent = dev; - snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x", - pdev->name, pdev->id); - - /* Register MDIO bus */ - error = of_mdiobus_register(priv->mii_bus, dev->of_node); - if (error) - goto out_free_bus; - - return 0; - -out_free_bus: - free_mdio_bitbang(priv->mii_bus); - return error; -} - -/* MDIO bus release function */ -static int ravb_mdio_release(struct ravb_private *priv) -{ - /* Unregister mdio bus */ - mdiobus_unregister(priv->mii_bus); - - /* Free bitbang info */ - free_mdio_bitbang(priv->mii_bus); - - return 0; -} - /* Network device open function for Ethernet AVB */ static int ravb_open(struct net_device *ndev) { @@ -1389,13 +1344,6 @@ static int ravb_open(struct net_device *ndev) struct device *dev = &pdev->dev; int error; - /* MDIO bus init */ - error = ravb_mdio_init(priv); - if (error) { - netdev_err(ndev, "failed to initialize MDIO\n"); - return error; - } - napi_enable(&priv->napi[RAVB_BE]); napi_enable(&priv->napi[RAVB_NC]); @@ -1473,7 +1421,6 @@ static int ravb_open(struct net_device *ndev) out_napi_off: napi_disable(&priv->napi[RAVB_NC]); napi_disable(&priv->napi[RAVB_BE]); - ravb_mdio_release(priv); return error; } @@ -1783,8 +1730,6 @@ static int ravb_close(struct net_device *ndev) ravb_ring_free(ndev, RAVB_BE); ravb_ring_free(ndev, RAVB_NC); - ravb_mdio_release(priv); - return 0; } @@ -1936,6 +1881,51 @@ static const struct net_device_ops ravb_netdev_ops = { .ndo_set_features = ravb_set_features, }; +/* MDIO bus init function */ +static int ravb_mdio_init(struct ravb_private *priv) +{ + struct platform_device *pdev = priv->pdev; + struct device *dev = &pdev->dev; + int error; + + /* Bitbang init */ + priv->mdiobb.ops = &bb_ops; + + /* MII controller setting */ + priv->mii_bus = alloc_mdio_bitbang(&priv->mdiobb); + if (!priv->mii_bus) + return -ENOMEM; + + /* Hook up MII support for ethtool */ + priv->mii_bus->name = "ravb_mii"; + priv->mii_bus->parent = dev; + snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x", + pdev->name, pdev->id); + + /* Register MDIO bus */ + error = of_mdiobus_register(priv->mii_bus, dev->of_node); + if (error) + goto out_free_bus; + + return 0; + +out_free_bus: + free_mdio_bitbang(priv->mii_bus); + return error; +} + +/* MDIO bus release function */ +static int ravb_mdio_release(struct ravb_private *priv) +{ + /* Unregister mdio bus */ + mdiobus_unregister(priv->mii_bus); + + /* Free bitbang info */ + free_mdio_bitbang(priv->mii_bus); + + return 0; +} + static const struct of_device_id ravb_match_table[] = { { .compatible = "renesas,etheravb-r8a7790", .data = (void *)RCAR_GEN2 }, { .compatible = "renesas,etheravb-r8a7794", .data = (void *)RCAR_GEN2 }, @@ -2176,6 +2166,13 @@ static int ravb_probe(struct platform_device *pdev) eth_hw_addr_random(ndev); } + /* MDIO bus init */ + error = ravb_mdio_init(priv); + if (error) { + dev_err(&pdev->dev, "failed to initialize MDIO\n"); + goto out_dma_free; + } + netif_napi_add(ndev, &priv->napi[RAVB_BE], ravb_poll, 64); netif_napi_add(ndev, &priv->napi[RAVB_NC], ravb_poll, 64); @@ -2197,6 +2194,8 @@ static int ravb_probe(struct platform_device *pdev) out_napi_del: netif_napi_del(&priv->napi[RAVB_NC]); netif_napi_del(&priv->napi[RAVB_BE]); + ravb_mdio_release(priv); +out_dma_free: dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat, priv->desc_bat_dma); @@ -2228,6 +2227,7 @@ static int ravb_remove(struct platform_device *pdev) unregister_netdev(ndev); netif_napi_del(&priv->napi[RAVB_NC]); netif_napi_del(&priv->napi[RAVB_BE]); + ravb_mdio_release(priv); pm_runtime_disable(&pdev->dev); free_netdev(ndev); platform_set_drvdata(pdev, NULL); From d9e81b2fb3724a0e4cc658a78b8905aabdb3ff44 Mon Sep 17 00:00:00 2001 From: Yinyin Zhu Date: Wed, 7 Oct 2020 11:16:32 +0800 Subject: [PATCH 05/86] io_uring: Fix resource leaking when kill the process The commit 1c4404efcf2c0> ("") doesn't solve the resource leak problem totally! When kworker is doing a io task for the io_uring, The process which submitted the io task has received a SIGKILL signal from the user. Then the io_cancel_async_work function could have sent a SIGINT signal to the kworker, but the judging condition is wrong. So it doesn't send a SIGINT signal to the kworker, then caused the resource leaking problem. Why the juding condition is wrong? The process is a multi-threaded process, we call the thread of the process which has submitted the io task Thread1. So the req->task is the current macro of the Thread1. when all the threads of the process have done exit procedure, the last thread will call the io_cancel_async_work, but the last thread may not the Thread1, so the task is not equal and doesn't send the SIGINT signal. To fix this bug, we alter the task attribute of the req with struct files_struct. And check the files instead. Fixes: 1c4404efcf2c0 ("io_uring: make sure async workqueue is canceled on exit") Signed-off-by: Yinyin Zhu Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a539b794f3b0c..33bd3ebdd3ef0f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -340,7 +340,7 @@ struct io_kiocb { u64 user_data; u32 result; u32 sequence; - struct task_struct *task; + struct files_struct *files; struct fs_struct *fs; @@ -514,7 +514,7 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, } } - req->task = current; + req->files = current->files; spin_lock_irqsave(&ctx->task_lock, flags); list_add(&req->task_list, &ctx->task_list); @@ -2382,6 +2382,8 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) if (ret) { struct io_ring_ctx *ctx = req->ctx; + req->files = current->files; + spin_lock_irq(&ctx->task_lock); list_add(&req->task_list, &ctx->task_list); req->work_task = NULL; @@ -3712,7 +3714,7 @@ static int io_uring_fasync(int fd, struct file *file, int on) } static void io_cancel_async_work(struct io_ring_ctx *ctx, - struct task_struct *task) + struct files_struct *files) { if (list_empty(&ctx->task_list)) return; @@ -3724,7 +3726,7 @@ static void io_cancel_async_work(struct io_ring_ctx *ctx, req = list_first_entry(&ctx->task_list, struct io_kiocb, task_list); list_del_init(&req->task_list); req->flags |= REQ_F_CANCEL; - if (req->work_task && (!task || req->task == task)) + if (req->work_task && (!files || req->files == files)) send_sig(SIGINT, req->work_task, 1); } spin_unlock_irq(&ctx->task_lock); @@ -3749,7 +3751,7 @@ static int io_uring_flush(struct file *file, void *data) struct io_ring_ctx *ctx = file->private_data; if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_cancel_async_work(ctx, current); + io_cancel_async_work(ctx, data); return 0; } From 75524f753318f61bcf974a3062973aff3a40394d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 7 Oct 2020 11:16:33 +0800 Subject: [PATCH 06/86] io_uring: Fix missing smp_mb() in io_cancel_async_work() The store to req->flags and load req->work_task should not be reordering in io_cancel_async_work(). We should make sure that either we store REQ_F_CANCE flag to req->flags or we see the req->work_task setted in io_sq_wq_submit_work(). Fixes: 1c4404efcf2c ("io_uring: make sure async workqueue is canceled on exit") Signed-off-by: Muchun Song Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 33bd3ebdd3ef0f..27770af20d240f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2247,6 +2247,12 @@ static void io_sq_wq_submit_work(struct work_struct *work) if (!ret) { req->work_task = current; + + /* + * Pairs with the smp_store_mb() (B) in + * io_cancel_async_work(). + */ + smp_mb(); /* A */ if (req->flags & REQ_F_CANCEL) { ret = -ECANCELED; goto end_req; @@ -3725,7 +3731,15 @@ static void io_cancel_async_work(struct io_ring_ctx *ctx, req = list_first_entry(&ctx->task_list, struct io_kiocb, task_list); list_del_init(&req->task_list); - req->flags |= REQ_F_CANCEL; + + /* + * The below executes an smp_mb(), which matches with the + * smp_mb() (A) in io_sq_wq_submit_work() such that either + * we store REQ_F_CANCEL flag to req->flags or we see the + * req->work_task setted in io_sq_wq_submit_work(). + */ + smp_store_mb(req->flags, req->flags | REQ_F_CANCEL); /* B */ + if (req->work_task && (!files || req->files == files)) send_sig(SIGINT, req->work_task, 1); } From efb1cef27d591db4a6258fed069163aef50ca7dd Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 7 Oct 2020 11:16:34 +0800 Subject: [PATCH 07/86] io_uring: Fix remove irrelevant req from the task_list If the process 0 has been initialized io_uring is complete, and then fork process 1. If process 1 exits and it leads to delete all reqs from the task_list. If we kill process 0. We will not send SIGINT signal to the kworker. So we can not remove the req from the task_list. The io_sq_wq_submit_work() can do that for us. Fixes: 1c4404efcf2c ("io_uring: make sure async workqueue is canceled on exit") Signed-off-by: Muchun Song Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 27770af20d240f..242612fe9b4844 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2272,13 +2272,11 @@ static void io_sq_wq_submit_work(struct work_struct *work) break; cond_resched(); } while (1); -end_req: - if (!list_empty(&req->task_list)) { - spin_lock_irq(&ctx->task_lock); - list_del_init(&req->task_list); - spin_unlock_irq(&ctx->task_lock); - } } +end_req: + spin_lock_irq(&ctx->task_lock); + list_del_init(&req->task_list); + spin_unlock_irq(&ctx->task_lock); /* drop submission reference */ io_put_req(req); @@ -3722,15 +3720,16 @@ static int io_uring_fasync(int fd, struct file *file, int on) static void io_cancel_async_work(struct io_ring_ctx *ctx, struct files_struct *files) { + struct io_kiocb *req; + if (list_empty(&ctx->task_list)) return; spin_lock_irq(&ctx->task_lock); - while (!list_empty(&ctx->task_list)) { - struct io_kiocb *req; - req = list_first_entry(&ctx->task_list, struct io_kiocb, task_list); - list_del_init(&req->task_list); + list_for_each_entry(req, &ctx->task_list, task_list) { + if (files && req->files != files) + continue; /* * The below executes an smp_mb(), which matches with the @@ -3740,7 +3739,7 @@ static void io_cancel_async_work(struct io_ring_ctx *ctx, */ smp_store_mb(req->flags, req->flags | REQ_F_CANCEL); /* B */ - if (req->work_task && (!files || req->files == files)) + if (req->work_task) send_sig(SIGINT, req->work_task, 1); } spin_unlock_irq(&ctx->task_lock); From ee413b2915bfdcb83972f9fd426409600eedc309 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 7 Oct 2020 11:16:35 +0800 Subject: [PATCH 08/86] io_uring: Fix double list add in io_queue_async_work() If we queue work in io_poll_wake(), it will leads to list double add. So we should add the list when the callback func is the io_sq_wq_submit_work. The following oops was seen: list_add double add: new=ffff9ca6a8f1b0e0, prev=ffff9ca62001cee8, next=ffff9ca6a8f1b0e0. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:31! Call Trace: io_poll_wake+0xf3/0x230 __wake_up_common+0x91/0x170 __wake_up_common_lock+0x7a/0xc0 io_commit_cqring+0xea/0x280 ? blkcg_iolatency_done_bio+0x2b/0x610 io_cqring_add_event+0x3e/0x60 io_complete_rw+0x58/0x80 dio_complete+0x106/0x250 blk_update_request+0xa0/0x3b0 blk_mq_end_request+0x1a/0x110 blk_mq_complete_request+0xd0/0xe0 nvme_irq+0x129/0x270 [nvme] __handle_irq_event_percpu+0x7b/0x190 handle_irq_event_percpu+0x30/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x91/0x1e0 do_IRQ+0x4d/0xd0 common_interrupt+0xf/0xf Fixes: 1c4404efcf2c ("io_uring: make sure async workqueue is canceled on exit") Reported-by: Jiachen Zhang Signed-off-by: Muchun Song Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 242612fe9b4844..4127ea027a14dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -514,12 +514,14 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, } } - req->files = current->files; + if (req->work.func == io_sq_wq_submit_work) { + req->files = current->files; - spin_lock_irqsave(&ctx->task_lock, flags); - list_add(&req->task_list, &ctx->task_list); - req->work_task = NULL; - spin_unlock_irqrestore(&ctx->task_lock, flags); + spin_lock_irqsave(&ctx->task_lock, flags); + list_add(&req->task_list, &ctx->task_list); + req->work_task = NULL; + spin_unlock_irqrestore(&ctx->task_lock, flags); + } queue_work(ctx->sqo_wq[rw], &req->work); } @@ -668,6 +670,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, state->cur_req++; } + INIT_LIST_HEAD(&req->task_list); req->file = NULL; req->ctx = ctx; req->flags = 0; From 8ece83bf754f758bf457c2b4ef0a9d456da3d0c6 Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Wed, 7 Oct 2020 09:24:01 +0530 Subject: [PATCH 09/86] net: wireless: nl80211: fix out-of-bounds access in nl80211_del_key() commit 3dc289f8f139997f4e9d3cfccf8738f20d23e47b upstream. In nl80211_parse_key(), key.idx is first initialized as -1. If this value of key.idx remains unmodified and gets returned, and nl80211_key_allowed() also returns 0, then rdev_del_key() gets called with key.idx = -1. This causes an out-of-bounds array access. Handle this issue by checking if the value of key.idx after nl80211_parse_key() is called and return -EINVAL if key.idx < 0. Cc: stable@vger.kernel.org Reported-by: syzbot+b1bb342d1d097516cbda@syzkaller.appspotmail.com Tested-by: syzbot+b1bb342d1d097516cbda@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Link: https://lore.kernel.org/r/20201007035401.9522-1-anant.thazhemadam@gmail.com Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ec559dbad56ea2..7bc4f37655237e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3975,6 +3975,9 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (key.idx < 0) + return -EINVAL; + if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); From 456d77c1bdfa560ead369cb092e14715e2d3573d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 7 Oct 2020 00:05:28 +0200 Subject: [PATCH 10/86] drm/nouveau/mem: guard against NULL pointer access in mem_del commit d10285a25e29f13353bbf7760be8980048c1ef2f upstream. other drivers seems to do something similar Signed-off-by: Karol Herbst Cc: dri-devel Cc: Dave Airlie Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20201006220528.13925-2-kherbst@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nouveau_mem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c index c002f896850739..9682f30ab6f68a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_mem.c +++ b/drivers/gpu/drm/nouveau/nouveau_mem.c @@ -176,6 +176,8 @@ void nouveau_mem_del(struct ttm_mem_reg *reg) { struct nouveau_mem *mem = nouveau_mem(reg); + if (!mem) + return; nouveau_mem_fini(mem); kfree(reg->mm_node); reg->mm_node = NULL; From 57b47abc1a4a337e009a05845b2df7d9fc2103e4 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Sat, 3 Oct 2020 12:01:52 +0200 Subject: [PATCH 11/86] vhost: Don't call access_ok() when using IOTLB commit 0210a8db2aeca393fb3067e234967877e3146266 upstream. When the IOTLB device is enabled, the vring addresses we get from userspace are GIOVAs. It is thus wrong to pass them down to access_ok() which only takes HVAs. Access validation is done at prefetch time with IOTLB. Teach vq_access_ok() about that by moving the (vq->iotlb) check from vhost_vq_access_ok() to vq_access_ok(). This prevents vhost_vring_set_addr() to fail when verifying the accesses. No behavior change for vhost_vq_access_ok(). BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1883084 Fixes: 6b1e6cc7855b ("vhost: new device IOTLB API") Cc: jasowang@redhat.com CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Greg Kurz Acked-by: Jason Wang Link: https://lore.kernel.org/r/160171931213.284610.2052489816407219136.stgit@bahia.lan Signed-off-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/vhost.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 36ca2cf419bfeb..ef0b7e18f35eeb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1299,6 +1299,11 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, struct vring_used __user *used) { + /* If an IOTLB device is present, the vring addresses are + * GIOVAs. Access validation occurs at prefetch time. */ + if (vq->iotlb) + return true; + return access_ok(desc, vhost_get_desc_size(vq, num)) && access_ok(avail, vhost_get_avail_size(vq, num)) && access_ok(used, vhost_get_used_size(vq, num)); @@ -1394,10 +1399,6 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq) if (!vq_log_access_ok(vq, vq->log_base)) return false; - /* Access validation occurs at prefetch time with IOTLB */ - if (vq->iotlb) - return true; - return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); From 920a61ddd3b5787957c1dec7a84b426e629b5051 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Sat, 3 Oct 2020 12:02:03 +0200 Subject: [PATCH 12/86] vhost: Use vhost_get_used_size() in vhost_vring_set_addr() commit 71878fa46c7e3b40fa7b3f1b6e4ba3f92f1ac359 upstream. The open-coded computation of the used size doesn't take the event into account when the VIRTIO_RING_F_EVENT_IDX feature is present. Fix that by using vhost_get_used_size(). Fixes: 8ea8cf89e19a ("vhost: support event index") Cc: stable@vger.kernel.org Signed-off-by: Greg Kurz Link: https://lore.kernel.org/r/160171932300.284610.11846106312938909461.stgit@bahia.lan Signed-off-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/vhost.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ef0b7e18f35eeb..57ab79fbcee95f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1545,8 +1545,7 @@ static long vhost_vring_set_addr(struct vhost_dev *d, /* Also validate log access for used ring if enabled. */ if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && !log_access_ok(vq->log_base, a.log_guest_addr, - sizeof *vq->used + - vq->num * sizeof *vq->used->ring)) + vhost_get_used_size(vq, vq->num))) return -EINVAL; } From ce8432912f1b9d82cf570b581ed818f34503c850 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 5 Oct 2020 10:56:22 -0700 Subject: [PATCH 13/86] usermodehelper: reset umask to default before executing user process commit 4013c1496c49615d90d36b9d513eee8e369778e9 upstream. Kernel threads intentionally do CLONE_FS in order to follow any changes that 'init' does to set up the root directory (or cwd). It is admittedly a bit odd, but it avoids the situation where 'init' does some extensive setup to initialize the system environment, and then we execute a usermode helper program, and it uses the original FS setup from boot time that may be very limited and incomplete. [ Both Al Viro and Eric Biederman point out that 'pivot_root()' will follow the root regardless, since it fixes up other users of root (see chroot_fs_refs() for details), but overmounting root and doing a chroot() would not. ] However, Vegard Nossum noticed that the CLONE_FS not only means that we follow the root and current working directories, it also means we share umask with whatever init changed it to. That wasn't intentional. Just reset umask to the original default (0022) before actually starting the usermode helper program. Reported-by: Vegard Nossum Cc: Al Viro Acked-by: Eric W. Biederman Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/umh.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/umh.c b/kernel/umh.c index 3474d6aa55d832..b8c524dcc76f1e 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,14 @@ static int call_usermodehelper_exec_async(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); + /* + * Initial kernel threads share ther FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. From 2293272345ff9942531657839fe6a2650cb9e3e0 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 23 Aug 2020 19:12:11 +0800 Subject: [PATCH 14/86] Platform: OLPC: Fix memleak in olpc_ec_probe commit 4fd9ac6bd3044734a7028bd993944c3617d1eede upstream. When devm_regulator_register() fails, ec should be freed just like when olpc_ec_cmd() fails. Fixes: 231c0c216172a ("Platform: OLPC: Add a regulator for the DCON") Signed-off-by: Dinghao Liu Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/platform/olpc/olpc-ec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index 190e4a6186ef7f..f64b82824db282 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ b/drivers/platform/olpc/olpc-ec.c @@ -439,7 +439,9 @@ static int olpc_ec_probe(struct platform_device *pdev) &config); if (IS_ERR(ec->dcon_rdev)) { dev_err(&pdev->dev, "failed to register DCON regulator\n"); - return PTR_ERR(ec->dcon_rdev); + err = PTR_ERR(ec->dcon_rdev); + kfree(ec); + return err; } ec->dbgfs_dir = olpc_ec_setup_debugfs(); From d101961ce588aae29b2e00f1ef44e94e4dcfd2b9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 12 Sep 2020 11:35:32 +0200 Subject: [PATCH 15/86] platform/x86: intel-vbtn: Fix SW_TABLET_MODE always reporting 1 on the HP Pavilion 11 x360 commit d823346876a970522ff9e4d2b323c9b734dcc4de upstream. Commit cfae58ed681c ("platform/x86: intel-vbtn: Only blacklist SW_TABLET_MODE on the 9 / "Laptop" chasis-type") restored SW_TABLET_MODE reporting on the HP stream x360 11 series on which it was previously broken by commit de9647efeaa9 ("platform/x86: intel-vbtn: Only activate tablet mode switch on 2-in-1's"). It turns out that enabling SW_TABLET_MODE reporting on devices with a chassis-type of 10 ("Notebook") causes SW_TABLET_MODE to always report 1 at boot on the HP Pavilion 11 x360, which causes libinput to disable the kbd and touchpad. The HP Pavilion 11 x360's ACPI VGBS method sets bit 4 instead of bit 6 when NOT in tablet mode at boot. Inspecting all the DSDTs in my DSDT collection shows only one other model, the Medion E1239T ever setting bit 4 and it always sets this together with bit 6. So lets treat bit 4 as a second bit which when set indicates the device not being in tablet-mode, as we already do for bit 6. While at it also prefix all VGBS constant defines with "VGBS_". Fixes: cfae58ed681c ("platform/x86: intel-vbtn: Only blacklist SW_TABLET_MODE on the 9 / "Laptop" chasis-type") Signed-off-by: Hans de Goede Acked-by: Mark Gross Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/intel-vbtn.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel-vbtn.c b/drivers/platform/x86/intel-vbtn.c index 3393ee95077f6c..226a477bf3a585 100644 --- a/drivers/platform/x86/intel-vbtn.c +++ b/drivers/platform/x86/intel-vbtn.c @@ -15,9 +15,13 @@ #include #include +/* Returned when NOT in tablet mode on some HP Stream x360 11 models */ +#define VGBS_TABLET_MODE_FLAG_ALT 0x10 /* When NOT in tablet mode, VGBS returns with the flag 0x40 */ -#define TABLET_MODE_FLAG 0x40 -#define DOCK_MODE_FLAG 0x80 +#define VGBS_TABLET_MODE_FLAG 0x40 +#define VGBS_DOCK_MODE_FLAG 0x80 + +#define VGBS_TABLET_MODE_FLAGS (VGBS_TABLET_MODE_FLAG | VGBS_TABLET_MODE_FLAG_ALT) MODULE_LICENSE("GPL"); MODULE_AUTHOR("AceLan Kao"); @@ -148,9 +152,9 @@ static void detect_tablet_mode(struct platform_device *device) if (ACPI_FAILURE(status)) return; - m = !(vgbs & TABLET_MODE_FLAG); + m = !(vgbs & VGBS_TABLET_MODE_FLAGS); input_report_switch(priv->input_dev, SW_TABLET_MODE, m); - m = (vgbs & DOCK_MODE_FLAG) ? 1 : 0; + m = (vgbs & VGBS_DOCK_MODE_FLAG) ? 1 : 0; input_report_switch(priv->input_dev, SW_DOCK, m); } From 9bd694ccfd44d6b4b5ae004ea0a1bfe3f13b7f35 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 13 Sep 2020 12:02:03 -0700 Subject: [PATCH 16/86] platform/x86: thinkpad_acpi: initialize tp_nvram_state variable commit 5f38b06db8af3ed6c2fc1b427504ca56fae2eacc upstream. clang static analysis flags this represenative problem thinkpad_acpi.c:2523:7: warning: Branch condition evaluates to a garbage value if (!oldn->mute || ^~~~~~~~~~~ In hotkey_kthread() mute is conditionally set by hotkey_read_nvram() but unconditionally checked by hotkey_compare_and_issue_event(). So the tp_nvram_state variable s[2] needs to be initialized. Fixes: 01e88f25985d ("ACPI: thinkpad-acpi: add CMOS NVRAM polling for hot keys (v9)") Signed-off-by: Tom Rix Reviewed-by: Hans de Goede Acked-by: mark gross Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/thinkpad_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index da794dcfdd928d..d1efede5fbe5fb 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -2587,7 +2587,7 @@ static void hotkey_compare_and_issue_event(struct tp_nvram_state *oldn, */ static int hotkey_kthread(void *data) { - struct tp_nvram_state s[2]; + struct tp_nvram_state s[2] = { 0 }; u32 poll_mask, event_mask; unsigned int si, so; unsigned long t; From 67a57230b4bfb377cd479c9b32d1e2ab70bc96d1 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:33 -0700 Subject: [PATCH 17/86] bpf: Fix sysfs export of empty BTF section commit e23bb04b0c938588eae41b7f4712b722290ed2b8 upstream. If BTF data is missing or removed from the ELF section it is still exported via sysfs as a zero-length file: root@OpenWrt:/# ls -l /sys/kernel/btf/vmlinux -r--r--r-- 1 root root 0 Jul 18 02:59 /sys/kernel/btf/vmlinux Moreover, reads from this file succeed and leak kernel data: root@OpenWrt:/# hexdump -C /sys/kernel/btf/vmlinux|head -10 000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 000cc0 00 00 00 00 00 00 00 00 00 00 00 00 80 83 b0 80 |................| 000cd0 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| 000ce0 00 00 00 00 00 00 00 00 00 00 00 00 57 ac 6e 9d |............W.n.| 000cf0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 002650 00 00 00 00 00 00 00 10 00 00 00 01 00 00 00 01 |................| 002660 80 82 9a c4 80 85 97 80 81 a9 51 68 00 00 00 02 |..........Qh....| 002670 80 25 44 dc 80 85 97 80 81 a9 50 24 81 ab c4 60 |.%D.......P$...`| This situation was first observed with kernel 5.4.x, cross-compiled for a MIPS target system. Fix by adding a sanity-check for export of zero-length data sections. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b38db205a66238f70823039a8c531535864eaac5.1600417359.git.Tony.Ambardar@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/sysfs_btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5aef..11b3380887fa04 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } From 6440fb9bda91c7bddad15d5dc9b0dd7fec652174 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:34 -0700 Subject: [PATCH 18/86] bpf: Prevent .BTF section elimination commit 65c204398928f9c79f1a29912b410439f7052635 upstream. Systems with memory or disk constraints often reduce the kernel footprint by configuring LD_DEAD_CODE_DATA_ELIMINATION. However, this can result in removal of any BTF information. Use the KEEP() macro to preserve the BTF data as done with other important sections, while still allowing for smaller kernels. Fixes: 90ceddcb4950 ("bpf: Support llvm-objcopy for vmlinux BTF") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/a635b5d3e2da044e7b51ec1315e8910fbce0083f.1600417359.git.Tony.Ambardar@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f050039ca2c07a..e5e2425875953b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -599,7 +599,7 @@ #define BTF \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ __start_BTF = .; \ - *(.BTF) \ + KEEP(*(.BTF)) \ __stop_BTF = .; \ } #else From da4cdc87dfeb596da0fa2e14d70e4f2648bee839 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 30 Sep 2020 15:19:05 +0200 Subject: [PATCH 19/86] platform/x86: intel-vbtn: Switch to an allow-list for SW_TABLET_MODE reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8169bd3e6e193497cab781acddcff8fde5d0c416 upstream. 2 recent commits: cfae58ed681c ("platform/x86: intel-vbtn: Only blacklist SW_TABLET_MODE on the 9 / "Laptop" chasis-type") 1fac39fd0316 ("platform/x86: intel-vbtn: Also handle tablet-mode switch on "Detachable" and "Portable" chassis-types") Enabled reporting of SW_TABLET_MODE on more devices since the vbtn ACPI interface is used by the firmware on some of those devices to report this. Testing has shown that unconditionally enabling SW_TABLET_MODE reporting on all devices with a chassis type of 8 ("Portable") or 10 ("Notebook") which support the VGBS method is a very bad idea. Many of these devices are normal laptops (non 2-in-1) models with a VGBS which always returns 0, which we translate to SW_TABLET_MODE=1. This in turn causes userspace (libinput) to suppress events from the builtin keyboard and touchpad, making the laptop essentially unusable. Since the problem of wrongly reporting SW_TABLET_MODE=1 in combination with libinput, leads to a non-usable system. Where as OTOH many people will not even notice when SW_TABLET_MODE is not being reported, this commit changes intel_vbtn_has_switches() to use a DMI based allow-list. The new DMI based allow-list matches on the 31 ("Convertible") and 32 ("Detachable") chassis-types, as these clearly are 2-in-1s and so far if they support the intel-vbtn ACPI interface they all have properly working SW_TABLET_MODE reporting. Besides these 2 generic matches, it also contains model specific matches for 2-in-1 models which use a different chassis-type and which are known to have properly working SW_TABLET_MODE reporting. This has been tested on the following 2-in-1 devices: Dell Venue 11 Pro 7130 vPro HP Pavilion X2 10-p002nd HP Stream x360 Convertible PC 11 Medion E1239T Fixes: cfae58ed681c ("platform/x86: intel-vbtn: Only blacklist SW_TABLET_MODE on the 9 / "Laptop" chasis-type") BugLink: https://forum.manjaro.org/t/keyboard-and-touchpad-only-work-on-kernel-5-6/22668 BugLink: https://bugzilla.opensuse.org/show_bug.cgi?id=1175599 Cc: Barnabás Pőcze Cc: Takashi Iwai Signed-off-by: Hans de Goede Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/intel-vbtn.c | 52 +++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/intel-vbtn.c b/drivers/platform/x86/intel-vbtn.c index 226a477bf3a585..5c103614a409ac 100644 --- a/drivers/platform/x86/intel-vbtn.c +++ b/drivers/platform/x86/intel-vbtn.c @@ -158,20 +158,54 @@ static void detect_tablet_mode(struct platform_device *device) input_report_switch(priv->input_dev, SW_DOCK, m); } +/* + * There are several laptops (non 2-in-1) models out there which support VGBS, + * but simply always return 0, which we translate to SW_TABLET_MODE=1. This in + * turn causes userspace (libinput) to suppress events from the builtin + * keyboard and touchpad, making the laptop essentially unusable. + * + * Since the problem of wrongly reporting SW_TABLET_MODE=1 in combination + * with libinput, leads to a non-usable system. Where as OTOH many people will + * not even notice when SW_TABLET_MODE is not being reported, a DMI based allow + * list is used here. This list mainly matches on the chassis-type of 2-in-1s. + * + * There are also some 2-in-1s which use the intel-vbtn ACPI interface to report + * SW_TABLET_MODE with a chassis-type of 8 ("Portable") or 10 ("Notebook"), + * these are matched on a per model basis, since many normal laptops with a + * possible broken VGBS ACPI-method also use these chassis-types. + */ +static const struct dmi_system_id dmi_switches_allow_list[] = { + { + .matches = { + DMI_EXACT_MATCH(DMI_CHASSIS_TYPE, "31" /* Convertible */), + }, + }, + { + .matches = { + DMI_EXACT_MATCH(DMI_CHASSIS_TYPE, "32" /* Detachable */), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Venue 11 Pro 7130"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Stream x360 Convertible PC 11"), + }, + }, + {} /* Array terminator */ +}; + static bool intel_vbtn_has_switches(acpi_handle handle) { - const char *chassis_type = dmi_get_system_info(DMI_CHASSIS_TYPE); unsigned long long vgbs; acpi_status status; - /* - * Some normal laptops have a VGBS method despite being non-convertible - * and their VGBS method always returns 0, causing detect_tablet_mode() - * to report SW_TABLET_MODE=1 to userspace, which causes issues. - * These laptops have a DMI chassis_type of 9 ("Laptop"), do not report - * switches on any devices with a DMI chassis_type of 9. - */ - if (chassis_type && strcmp(chassis_type, "9") == 0) + if (!dmi_check_system(dmi_switches_allow_list)) return false; status = acpi_evaluate_integer(handle, "VGBS", NULL, &vgbs); From 3fd2647f9d687295c96865d0713e20c79924c6d5 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Sat, 3 Oct 2020 01:09:16 +0800 Subject: [PATCH 20/86] platform/x86: thinkpad_acpi: re-initialize ACPI buffer size when reuse commit 720ef73d1a239e33c3ad8fac356b9b1348e68aaf upstream. Evaluating ACPI _BCL could fail, then ACPI buffer size will be set to 0. When reuse this ACPI buffer, AE_BUFFER_OVERFLOW will be triggered. Re-initialize buffer size will make ACPI evaluate successfully. Fixes: 46445b6b896fd ("thinkpad-acpi: fix handle locate for video and query of _BCL") Signed-off-by: Aaron Ma Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/thinkpad_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d1efede5fbe5fb..abcb336a515a10 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -6863,8 +6863,10 @@ static int __init tpacpi_query_bcl_levels(acpi_handle handle) list_for_each_entry(child, &device->children, node) { acpi_status status = acpi_evaluate_object(child->handle, "_BCL", NULL, &buffer); - if (ACPI_FAILURE(status)) + if (ACPI_FAILURE(status)) { + buffer.length = ACPI_ALLOCATE_BUFFER; continue; + } obj = (union acpi_object *)buffer.pointer; if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) { From 7c1847aa4932d988af89a12b5bc63a444ec5c5df Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 13 Jul 2020 11:12:54 +0900 Subject: [PATCH 21/86] driver core: Fix probe_count imbalance in really_probe() commit b292b50b0efcc7095d8bf15505fba6909bb35dce upstream. syzbot is reporting hung task in wait_for_device_probe() [1]. At least, we always need to decrement probe_count if we incremented probe_count in really_probe(). However, since I can't find "Resources present before probing" message in the console log, both "this message simply flowed off" and "syzbot is not hitting this path" will be possible. Therefore, while we are at it, let's also prepare for concurrent wait_for_device_probe() calls by replacing wake_up() with wake_up_all(). [1] https://syzkaller.appspot.com/bug?id=25c833f1983c9c1d512f4ff860dd0d7f5a2e2c0f Reported-by: syzbot Fixes: 7c35e699c88bd607 ("driver core: Print device when resources present in really_probe()") Cc: Geert Uytterhoeven Signed-off-by: Tetsuo Handa Cc: stable Link: https://lore.kernel.org/r/20200713021254.3444-1-penguin-kernel@I-love.SAKURA.ne.jp [iwamatsu: Drop patch for deferred_probe_timeout_work_func()] Signed-off-by: Nobuhiro Iwamatsu (CIP) Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 1d5dd37f3abe46..84e757860ebb94 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -518,7 +518,8 @@ static int really_probe(struct device *dev, struct device_driver *drv) drv->bus->name, __func__, drv->name, dev_name(dev)); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); - return -EBUSY; + ret = -EBUSY; + goto done; } re_probe: @@ -639,7 +640,7 @@ static int really_probe(struct device *dev, struct device_driver *drv) ret = 0; done: atomic_dec(&probe_count); - wake_up(&probe_waitqueue); + wake_up_all(&probe_waitqueue); return ret; } From 2499c15115ac2e426df5f1075f3968293d1d5f3f Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 23 Apr 2020 14:53:40 +0300 Subject: [PATCH 22/86] perf test session topology: Fix data path commit dbd660e6b2884b864d2642d930a163d3bcebe4be upstream. Commit 2d4f27999b88 ("perf data: Add global path holder") missed path conversion in tests/topology.c, causing the "Session topology" testcase to "hang" (waits forever for input from stdin) when doing "ssh $VM perf test". Can be reproduced by running "cat | perf test topo", and crashed by replacing cat with true: $ true | perf test -v topo 40: Session topology : --- start --- test child forked, pid 3638 templ file: /tmp/perf-test-QPvAch incompatible file format incompatible file format (rerun with -v to learn more) free(): invalid pointer test child interrupted ---- end ---- Session topology: FAILED! Committer testing: Reproduced the above result before the patch and after it is back working: # true | perf test -v topo 41: Session topology : --- start --- test child forked, pid 19374 templ file: /tmp/perf-test-YOTEQg CPU 0, core 0, socket 0 CPU 1, core 1, socket 0 CPU 2, core 2, socket 0 CPU 3, core 3, socket 0 CPU 4, core 0, socket 0 CPU 5, core 1, socket 0 CPU 6, core 2, socket 0 CPU 7, core 3, socket 0 test child finished with 0 ---- end ---- Session topology: Ok # Fixes: 2d4f27999b88 ("perf data: Add global path holder") Signed-off-by: Tommi Rantala Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mamatha Inamdar Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: http://lore.kernel.org/lkml/20200423115341.562782-1-tommi.t.rantala@nokia.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/tests/topology.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 4a800499d7c35f..22daf2bdf5faf1 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -33,10 +33,8 @@ static int session_write_header(char *path) { struct perf_session *session; struct perf_data data = { - .file = { - .path = path, - }, - .mode = PERF_DATA_MODE_WRITE, + .path = path, + .mode = PERF_DATA_MODE_WRITE, }; session = perf_session__new(&data, false, NULL); @@ -63,10 +61,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) { struct perf_session *session; struct perf_data data = { - .file = { - .path = path, - }, - .mode = PERF_DATA_MODE_READ, + .path = path, + .mode = PERF_DATA_MODE_READ, }; int i; From 2118c7ba5f2a08df126d59006e2cf8f7cd68aecb Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 5 Mar 2020 10:37:12 +0200 Subject: [PATCH 23/86] perf top: Fix stdio interface input handling with glibc 2.28+ commit 29b4f5f188571c112713c35cc87eefb46efee612 upstream. Since glibc 2.28 when running 'perf top --stdio', input handling no longer works, but hitting any key always just prints the "Mapped keys" help text. To fix it, call clearerr() in the display_thread() loop to clear any EOF sticky errors, as instructed in the glibc NEWS file (https://sourceware.org/git/?p=glibc.git;a=blob;f=NEWS): * All stdio functions now treat end-of-file as a sticky condition. If you read from a file until EOF, and then the file is enlarged by another process, you must call clearerr or another function with the same effect (e.g. fseek, rewind) before you can read the additional data. This corrects a longstanding C99 conformance bug. It is most likely to affect programs that use stdio to read interactive input from a terminal. (Bug #1190.) Signed-off-by: Tommi Rantala Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200305083714.9381-2-tommi.t.rantala@nokia.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/builtin-top.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1f60124eb19bb4..a30d62186f5e9e 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -683,7 +683,9 @@ static void *display_thread(void *arg) delay_msecs = top->delay_secs * MSEC_PER_SEC; set_term_quiet_input(&save); /* trash return*/ - getc(stdin); + clearerr(stdin); + if (poll(&stdin_poll, 1, 0) > 0) + getc(stdin); while (!done) { perf_top__print_sym_table(top); From e8e1d16e0b89be064bd97094af58dee7502858c7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 10 Sep 2020 11:57:08 +0200 Subject: [PATCH 24/86] i2c: i801: Exclude device from suspend direct complete optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 845b89127bc5458d0152a4d63f165c62a22fcb70 upstream. By default, PCI drivers with runtime PM enabled will skip the calls to suspend and resume on system PM. For this driver, we don't want that, as we need to perform additional steps for system PM to work properly on all systems. So instruct the PM core to not skip these calls. Fixes: a9c8088c7988 ("i2c: i801: Don't restore config registers on runtime PM") Reported-by: Volker Rümelin Signed-off-by: Jean Delvare Cc: stable@vger.kernel.org Signed-off-by: Wolfram Sang [iwamatsu: Use DPM_FLAG_NEVER_SKIP instead of DPM_FLAG_NO_DIRECT_COMPLETE] Signed-off-by: Nobuhiro Iwamatsu (CIP) Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 9a80c3c7e8af27..c40eef4e7a9858 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1891,6 +1891,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_set_drvdata(dev, priv); + dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_NEVER_SKIP); pm_runtime_set_autosuspend_delay(&dev->dev, 1000); pm_runtime_use_autosuspend(&dev->dev); pm_runtime_put_autosuspend(&dev->dev); From f02dc39bbb2020de61925b2641661b7347ff4526 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Mon, 29 Jun 2020 11:25:43 -0500 Subject: [PATCH 25/86] arm64: dts: stratix10: add status to qspi dts node commit 263a0269a59c0b4145829462a107fe7f7327105f upstream. Add status = "okay" to QSPI node. Fixes: 0cb140d07fc75 ("arm64: dts: stratix10: Add QSPI support for Stratix10") Cc: linux-stable # >= v5.6 Signed-off-by: Dinh Nguyen [iwamatsu: Drop arch/arm64/boot/dts/altera/socfpga_stratix10_socdk_nand.dts] Signed-off-by: Nobuhiro Iwamatsu Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts index 66e4ffb4e929d2..2c8c2b322c727e 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts @@ -155,6 +155,7 @@ }; &qspi { + status = "okay"; flash@0 { #address-cells = <1>; #size-cells = <1>; From 19d8412679f2c73b880ab3ff78ad2e6b7589f8a5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 30 Oct 2019 12:23:11 +0000 Subject: [PATCH 26/86] Btrfs: send, allow clone operations within the same file commit 11f2069c113e02971b8db6fda62f9b9cd31a030f upstream. For send we currently skip clone operations when the source and destination files are the same. This is so because clone didn't support this case in its early days, but support for it was added back in May 2013 by commit a96fbc72884fcb ("Btrfs: allow file data clone within a file"). This change adds support for it. Example: $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt/sdd $ xfs_io -f -c "pwrite -S 0xab -b 64K 0 64K" /mnt/sdd/foobar $ xfs_io -c "reflink /mnt/sdd/foobar 0 64K 64K" /mnt/sdd/foobar $ btrfs subvolume snapshot -r /mnt/sdd /mnt/sdd/snap $ mkfs.btrfs -f /dev/sde $ mount /dev/sde /mnt/sde $ btrfs send /mnt/sdd/snap | btrfs receive /mnt/sde Without this change file foobar at the destination has a single 128Kb extent: $ filefrag -v /mnt/sde/snap/foobar Filesystem type is: 9123683e File size of /mnt/sde/snap/foobar is 131072 (32 blocks of 4096 bytes) ext: logical_offset: physical_offset: length: expected: flags: 0: 0.. 31: 0.. 31: 32: last,unknown_loc,delalloc,eof /mnt/sde/snap/foobar: 1 extent found With this we get a single 64Kb extent that is shared at file offsets 0 and 64K, just like in the source filesystem: $ filefrag -v /mnt/sde/snap/foobar Filesystem type is: 9123683e File size of /mnt/sde/snap/foobar is 131072 (32 blocks of 4096 bytes) ext: logical_offset: physical_offset: length: expected: flags: 0: 0.. 15: 3328.. 3343: 16: shared 1: 16.. 31: 3328.. 3343: 16: 3344: last,shared,eof /mnt/sde/snap/foobar: 2 extents found Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6ad216e8178e8d..48d028b1cfafde 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1257,12 +1257,20 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (found->root == bctx->sctx->send_root) { /* - * TODO for the moment we don't accept clones from the inode - * that is currently send. We may change this when - * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same - * file. + * If the source inode was not yet processed we can't issue a + * clone operation, as the source extent does not exist yet at + * the destination of the stream. */ - if (ino >= bctx->cur_objectid) + if (ino > bctx->cur_objectid) + return 0; + /* + * We clone from the inode currently being sent as long as the + * source extent is already processed, otherwise we could try + * to clone from an extent that does not exist yet at the + * destination of the stream. + */ + if (ino == bctx->cur_objectid && + offset >= bctx->sctx->cur_inode_next_write_offset) return 0; } From 5aefd1fa9f4d7b5765c0969d8a47e426c64c8687 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 29 Jan 2020 17:09:53 +0000 Subject: [PATCH 27/86] Btrfs: send, fix emission of invalid clone operations within the same file commit 9722b10148504c4153a74a9c89725af271e490fc upstream. When doing an incremental send and a file has extents shared with itself at different file offsets, it's possible for send to emit clone operations that will fail at the destination because the source range goes beyond the file's current size. This happens when the file size has increased in the send snapshot, there is a hole between the shared extents and both shared extents are at file offsets which are greater the file's size in the parent snapshot. Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt/sdb $ xfs_io -f -c "pwrite -S 0xf1 0 64K" /mnt/sdb/foobar $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/base $ btrfs send -f /tmp/1.snap /mnt/sdb/base # Create a 320K extent at file offset 512K. $ xfs_io -c "pwrite -S 0xab 512K 64K" /mnt/sdb/foobar $ xfs_io -c "pwrite -S 0xcd 576K 64K" /mnt/sdb/foobar $ xfs_io -c "pwrite -S 0xef 640K 64K" /mnt/sdb/foobar $ xfs_io -c "pwrite -S 0x64 704K 64K" /mnt/sdb/foobar $ xfs_io -c "pwrite -S 0x73 768K 64K" /mnt/sdb/foobar # Clone part of that 320K extent into a lower file offset (192K). # This file offset is greater than the file's size in the parent # snapshot (64K). Also the clone range is a bit behind the offset of # the 320K extent so that we leave a hole between the shared extents. $ xfs_io -c "reflink /mnt/sdb/foobar 448K 192K 192K" /mnt/sdb/foobar $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/incr $ btrfs send -p /mnt/sdb/base -f /tmp/2.snap /mnt/sdb/incr $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ btrfs receive -f /tmp/1.snap /mnt/sdc $ btrfs receive -f /tmp/2.snap /mnt/sdc ERROR: failed to clone extents to foobar: Invalid argument The problem is that after processing the extent at file offset 256K, which refers to the first 128K of the 320K extent created by the buffered write operations, we have 'cur_inode_next_write_offset' set to 384K, which corresponds to the end offset of the partially shared extent (256K + 128K) and to the current file size in the receiver. Then when we process the extent at offset 512K, we do extent backreference iteration to figure out if we can clone the extent from some other inode or from the same inode, and we consider the extent at offset 256K of the same inode as a valid source for a clone operation, which is not correct because at that point the current file size in the receiver is 384K, which corresponds to the end of last processed extent (at file offset 256K), so using a clone source range from 256K to 256K + 320K is invalid because that goes past the current size of the file (384K) - this makes the receiver get an -EINVAL error when attempting the clone operation. So fix this by excluding clone sources that have a range that goes beyond the current file size in the receiver when iterating extent backreferences. A test case for fstests follows soon. Fixes: 11f2069c113e02 ("Btrfs: send, allow clone operations within the same file") CC: stable@vger.kernel.org # 5.5+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 48d028b1cfafde..b0e5dfb9be7ab9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1270,7 +1270,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * destination of the stream. */ if (ino == bctx->cur_objectid && - offset >= bctx->sctx->cur_inode_next_write_offset) + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } From 6a0f5da2db3b47fb5adaa6d0833f80b75e4d46a8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Oct 2019 21:57:26 +0800 Subject: [PATCH 28/86] btrfs: volumes: Use more straightforward way to calculate map length commit 2d974619a77f106f3d1341686dea95c0eaad601f upstream. The old code goes: offset = logical - em->start; length = min_t(u64, em->len - offset, length); Where @length calculation is dependent on offset, it can take reader several more seconds to find it's just the same code as: offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); Use above code to make the length calculate independent from other variable, thus slightly increase the readability. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4ecd6663dfb515..0fff1441193711 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5714,7 +5714,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } offset = logical - em->start; - length = min_t(u64, em->len - offset, length); + length = min_t(u64, em->start + em->len - logical, length); stripe_len = map->stripe_len; /* From 1f90600e259bff46f0195df30c7a0571ea999a45 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Oct 2019 21:57:27 +0800 Subject: [PATCH 29/86] btrfs: Ensure we trim ranges across block group boundary commit 6b7faadd985c990324b5b5bd18cc4ba5c395eb65 upstream. [BUG] When deleting large files (which cross block group boundary) with discard mount option, we find some btrfs_discard_extent() calls only trimmed part of its space, not the whole range: btrfs_discard_extent: type=0x1 start=19626196992 len=2144530432 trimmed=1073741824 ratio=50% type: bbio->map_type, in above case, it's SINGLE DATA. start: Logical address of this trim len: Logical length of this trim trimmed: Physically trimmed bytes ratio: trimmed / len Thus leaving some unused space not discarded. [CAUSE] When discard mount option is specified, after a transaction is fully committed (super block written to disk), we begin to cleanup pinned extents in the following call chain: btrfs_commit_transaction() |- btrfs_finish_extent_commit() |- find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); |- btrfs_discard_extent() However, pinned extents are recorded in an extent_io_tree, which can merge adjacent extent states. When a large file gets deleted and it has adjacent file extents across block group boundary, we will get a large merged range like this: |<--- BG1 --->|<--- BG2 --->| |//////|<-- Range to discard --->|/////| To discard that range, we have the following calls: btrfs_discard_extent() |- btrfs_map_block() | Returned bbio will end at BG1's end. As btrfs_map_block() | never returns result across block group boundary. |- btrfs_issuse_discard() Issue discard for each stripe. So we will only discard the range in BG1, not the remaining part in BG2. Furthermore, this bug is not that reliably observed, for above case, if there is no other extent in BG2, BG2 will be empty and btrfs will trim all space of BG2, covering up the bug. [FIX] - Allow __btrfs_map_block_for_discard() to modify @length parameter btrfs_map_block() uses its @length paramter to notify the caller how many bytes are mapped in current call. With __btrfs_map_block_for_discard() also modifing the @length, btrfs_discard_extent() now understands when to do extra trim. - Call btrfs_map_block() in a loop until we hit the range end Since we now know how many bytes are mapped each time, we can iterate through each block group boundary and issue correct trim for each range. Reviewed-by: Filipe Manana Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.c | 6 ++++-- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7658f3193175b0..388449101705ef 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1305,8 +1305,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { - int ret; + int ret = 0; u64 discarded_bytes = 0; + u64 end = bytenr + num_bytes; + u64 cur = bytenr; struct btrfs_bio *bbio = NULL; @@ -1315,15 +1317,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, - &bbio, 0); - /* Error condition is -ENOMEM */ - if (!ret) { - struct btrfs_bio_stripe *stripe = bbio->stripes; + while (cur < end) { + struct btrfs_bio_stripe *stripe; int i; + num_bytes = end - cur; + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, + &num_bytes, &bbio, 0); + /* + * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or + * -EOPNOTSUPP. For any such error, @num_bytes is not updated, + * thus we can't continue anyway. + */ + if (ret < 0) + goto out; + stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; struct request_queue *req_q; @@ -1340,10 +1350,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe->physical, stripe->length, &bytes); - if (!ret) + if (!ret) { discarded_bytes += bytes; - else if (ret != -EOPNOTSUPP) - break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + } else if (ret != -EOPNOTSUPP) { + /* + * Logic errors or -ENOMEM, or -EIO, but + * unlikely to happen. + * + * And since there are two loops, explicitly + * go to out to avoid confusion. + */ + btrfs_put_bbio(bbio); + goto out; + } /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1353,7 +1372,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, ret = 0; } btrfs_put_bbio(bbio); + cur += num_bytes; } +out: btrfs_bio_counter_dec(fs_info); if (actual_bytes) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0fff1441193711..e798caee978e70 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5676,12 +5676,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) * replace. */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, + u64 logical, u64 *length_ret, struct btrfs_bio **bbio_ret) { struct extent_map *em; struct map_lookup *map; struct btrfs_bio *bbio; + u64 length = *length_ret; u64 offset; u64 stripe_nr; u64 stripe_nr_end; @@ -5715,6 +5716,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; stripe_len = map->stripe_len; /* @@ -6129,7 +6131,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - *length, bbio_ret); + length, bbio_ret); ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); if (ret < 0) From e531fd7f8b3a9feefcad7873e972fccb123c4b07 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 18:49:13 +0100 Subject: [PATCH 30/86] btrfs: fix RWF_NOWAIT write not failling when we need to cow commit 260a63395f90f67d6ab89e4266af9e3dc34a77e9 upstream. If we attempt to do a RWF_NOWAIT write against a file range for which we can only do NOCOW for a part of it, due to the existence of holes or shared extents for example, we proceed with the write as if it were possible to NOCOW the whole range. Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ touch /mnt/sdj/bar $ chattr +C /mnt/sdj/bar $ xfs_io -d -c "pwrite -S 0xab -b 256K 0 256K" /mnt/bar wrote 262144/262144 bytes at offset 0 256 KiB, 1 ops; 0.0003 sec (694.444 MiB/sec and 2777.7778 ops/sec) $ xfs_io -c "fpunch 64K 64K" /mnt/bar $ sync $ xfs_io -d -c "pwrite -N -V 1 -b 128K -S 0xfe 0 128K" /mnt/bar wrote 131072/131072 bytes at offset 0 128 KiB, 1 ops; 0.0007 sec (160.051 MiB/sec and 1280.4097 ops/sec) This last write should fail with -EAGAIN since the file range from 64K to 128K is a hole. On xfs it fails, as expected, but on ext4 it currently succeeds because apparently it is expensive to check if there are extents allocated for the whole range, but I'll check with the ext4 people. Fix the issue by checking if check_can_nocow() returns a number of NOCOW'able bytes smaller then the requested number of bytes, and if it does return -EAGAIN. Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/file.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4e4ddd5629e555..4d0e1f9452a532 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1919,13 +1919,27 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, pos = iocb->ki_pos; count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { + size_t nocow_bytes = count; + /* * We will allocate space in case nodatacow is not set, * so bail */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + + /* check_can_nocow() locks the snapshot lock on success */ + btrfs_end_write_no_snapshotting(root); + /* + * There are holes in the range or parts of the range that must + * be COWed (shared extents, RO block groups, etc), so just bail + * out. + */ + if (nocow_bytes < count) { inode_unlock(inode); return -EAGAIN; } From c0f3c53869951431f7aa14849b2aca1836e31209 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jun 2020 07:23:50 +0800 Subject: [PATCH 31/86] btrfs: allow btrfs_truncate_block() to fallback to nocow for data space reservation commit 6d4572a9d71d5fc2affee0258d8582d39859188c upstream. [BUG] When the data space is exhausted, even if the inode has NOCOW attribute, we will still refuse to truncate unaligned range due to ENOSPC. The following script can reproduce it pretty easily: #!/bin/bash dev=/dev/test/test mnt=/mnt/btrfs umount $dev &> /dev/null umount $mnt &> /dev/null mkfs.btrfs -f $dev -b 1G mount -o nospace_cache $dev $mnt touch $mnt/foobar chattr +C $mnt/foobar xfs_io -f -c "pwrite -b 4k 0 4k" $mnt/foobar > /dev/null xfs_io -f -c "pwrite -b 4k 0 1G" $mnt/padding &> /dev/null sync xfs_io -c "fpunch 0 2k" $mnt/foobar umount $mnt Currently this will fail at the fpunch part. [CAUSE] Because btrfs_truncate_block() always reserves space without checking the NOCOW attribute. Since the writeback path follows NOCOW bit, we only need to bother the space reservation code in btrfs_truncate_block(). [FIX] Make btrfs_truncate_block() follow btrfs_buffered_write() to try to reserve data space first, and fall back to NOCOW check only when we don't have enough space. Such always-try-reserve is an optimization introduced in btrfs_buffered_write(), to avoid expensive btrfs_check_can_nocow() call. This patch will export check_can_nocow() as btrfs_check_can_nocow(), and use it in btrfs_truncate_block() to fix the problem. Reported-by: Martin Doucha Reviewed-by: Filipe Manana Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 9 +++++---- fs/btrfs/inode.c | 44 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a690c10afaa0c..23b4f38e239282 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2956,6 +2956,8 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4d0e1f9452a532..4126513e2429cc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1546,8 +1546,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1647,7 +1647,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(BTRFS_I(inode), pos, + btrfs_check_can_nocow(BTRFS_I(inode), pos, &write_bytes) > 0) { /* * For nodata cow case, no need to reserve @@ -1927,7 +1927,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, + &nocow_bytes) <= 0) { inode_unlock(inode); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 182e93a5b11d53..67b49b94c9cd6e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5133,11 +5133,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -5149,11 +5151,27 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) - goto out; + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize); + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + btrfs_check_can_nocow(BTRFS_I(inode), block_start, + &write_bytes) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); + goto out; + } again: page = find_or_create_page(mapping, index, mask); if (!page) { @@ -5222,14 +5240,26 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); + out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: + if (only_release_metadata) + btrfs_end_write_no_snapshotting(BTRFS_I(inode)->root); extent_changeset_free(data_reserved); return ret; } From 20f96fee81c632a8c5e9a0f055b0d5f26b62efef Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 6 Oct 2020 16:36:47 -0700 Subject: [PATCH 32/86] nvme-core: put ctrl ref when module ref get fail commit 4bab69093044ca81f394bd0780be1b71c5a4d308 upstream. When try_module_get() fails in the nvme_dev_open() it returns without releasing the ctrl reference which was taken earlier. Put the ctrl reference which is taken before calling the try_module_get() in the error return code path. Fixes: 52a3974feb1a "nvme-core: get/put ctrl and transport module in nvme_dev_open/release()" Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 207ed6d49ad7c1..ce69aaea581a50 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2932,8 +2932,10 @@ static int nvme_dev_open(struct inode *inode, struct file *file) } nvme_get_ctrl(ctrl); - if (!try_module_get(ctrl->ops->module)) + if (!try_module_get(ctrl->ops->module)) { + nvme_put_ctrl(ctrl); return -EINVAL; + } file->private_data = ctrl; return 0; From 1317469fa05bc01da2da7fb78da51a51ed7a8a86 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Oct 2020 01:42:46 -0700 Subject: [PATCH 33/86] macsec: avoid use-after-free in macsec_handle_frame() commit c7cc9200e9b4a2ac172e990ef1975cd42975dad6 upstream. De-referencing skb after call to gro_cells_receive() is not allowed. We need to fetch skb->len earlier. Fixes: 5491e7c6b1a9 ("macsec: enable GRO and RPS on macsec devices") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/macsec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 4c86a73db475a3..f233a6933a9760 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1080,6 +1080,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) struct macsec_rx_sa *rx_sa; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; + unsigned int len; sci_t sci; u32 pn; bool cbit; @@ -1236,9 +1237,10 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) macsec_rxsc_put(rx_sc); skb_orphan(skb); + len = skb->len; ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) - count_rx(dev, skb->len); + count_rx(dev, len); else macsec->secy.netdev->stats.rx_dropped++; From 5c62d335317cb9c25273a46fa80a42a6c8dd0108 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 9 Oct 2020 20:07:59 -0700 Subject: [PATCH 34/86] mm/khugepaged: fix filemap page_to_pgoff(page) != offset commit 033b5d77551167f8c24ca862ce83d3e0745f9245 upstream. There have been elusive reports of filemap_fault() hitting its VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page) on kernels built with CONFIG_READ_ONLY_THP_FOR_FS=y. Suren has hit it on a kernel with CONFIG_READ_ONLY_THP_FOR_FS=y and CONFIG_NUMA is not set: and he has analyzed it down to how khugepaged without NUMA reuses the same huge page after collapse_file() failed (whereas NUMA targets its allocation to the respective node each time). And most of us were usually testing with CONFIG_NUMA=y kernels. collapse_file(old start) new_page = khugepaged_alloc_page(hpage) __SetPageLocked(new_page) new_page->index = start // hpage->index=old offset new_page->mapping = mapping xas_store(&xas, new_page) filemap_fault page = find_get_page(mapping, offset) // if offset falls inside hpage then // compound_head(page) == hpage lock_page_maybe_drop_mmap() __lock_page(page) // collapse fails xas_store(&xas, old page) new_page->mapping = NULL unlock_page(new_page) collapse_file(new start) new_page = khugepaged_alloc_page(hpage) __SetPageLocked(new_page) new_page->index = start // hpage->index=new offset new_page->mapping = mapping // mapping becomes valid again // since compound_head(page) == hpage // page_to_pgoff(page) got changed VM_BUG_ON_PAGE(page_to_pgoff(page) != offset) An initial patch replaced __SetPageLocked() by lock_page(), which did fix the race which Suren illustrates above. But testing showed that it's not good enough: if the racing task's __lock_page() gets delayed long after its find_get_page(), then it may follow collapse_file(new start)'s successful final unlock_page(), and crash on the same VM_BUG_ON_PAGE. It could be fixed by relaxing filemap_fault()'s VM_BUG_ON_PAGE to a check and retry (as is done for mapping), with similar relaxations in find_lock_entry() and pagecache_get_page(): but it's not obvious what else might get caught out; and khugepaged non-NUMA appears to be unique in exposing a page to page cache, then revoking, without going through a full cycle of freeing before reuse. Instead, non-NUMA khugepaged_prealloc_page() release the old page if anyone else has a reference to it (1% of cases when I tested). Although never reported on huge tmpfs, I believe its find_lock_entry() has been at similar risk; but huge tmpfs does not rely on khugepaged for its normal working nearly so much as READ_ONLY_THP_FOR_FS does. Reported-by: Denis Lisov Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206569 Link: https://lore.kernel.org/linux-mm/?q=20200219144635.3b7417145de19b65f258c943%40linux-foundation.org Reported-by: Qian Cai Link: https://lore.kernel.org/linux-xfs/?q=20200616013309.GB815%40lca.pw Reported-and-analyzed-by: Suren Baghdasaryan Fixes: 87c460a0bded ("mm/khugepaged: collapse_shmem() without freezing new_page") Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # v4.9+ Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/khugepaged.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9ec618d5ea5577..be897e45effbfa 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -832,6 +832,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { + /* + * If the hpage allocated earlier was briefly exposed in page cache + * before collapse_file() failed, it is possible that racing lookups + * have not yet completed, and would then be unpleasantly surprised by + * finding the hpage reused for the same mapping at a different offset. + * Just release the previous allocation if there is any danger of that. + */ + if (*hpage && page_count(*hpage) > 1) { + put_page(*hpage); + *hpage = NULL; + } + if (!*hpage) *hpage = khugepaged_alloc_hugepage(wait); From d23dd3864b4c9231b4281d7894848362fe2fc06c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 2 Oct 2020 16:27:28 +0800 Subject: [PATCH 35/86] net: introduce helper sendpage_ok() in include/linux/net.h commit c381b07941adc2274ce552daf86c94701c5e265a upstream. The original problem was from nvme-over-tcp code, who mistakenly uses kernel_sendpage() to send pages allocated by __get_free_pages() without __GFP_COMP flag. Such pages don't have refcount (page_count is 0) on tail pages, sending them by kernel_sendpage() may trigger a kernel panic from a corrupted kernel heap, because these pages are incorrectly freed in network stack as page_count 0 pages. This patch introduces a helper sendpage_ok(), it returns true if the checking page, - is not slab page: PageSlab(page) is false. - has page refcount: page_count(page) is not zero All drivers who want to send page to remote end by kernel_sendpage() may use this helper to check whether the page is OK. If the helper does not return true, the driver should try other non sendpage method (e.g. sock_no_sendpage()) to handle the page. Signed-off-by: Coly Li Cc: Chaitanya Kulkarni Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Jan Kara Cc: Jens Axboe Cc: Mikhail Skorzhinskii Cc: Philipp Reisner Cc: Sagi Grimberg Cc: Vlastimil Babka Cc: stable@vger.kernel.org Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/net.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/net.h b/include/linux/net.h index 9cafb5f353a97c..47dd7973ae9bca 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -288,6 +289,21 @@ do { \ #define net_get_random_once_wait(buf, nbytes) \ get_random_once_wait((buf), (nbytes)) +/* + * E.g. XFS meta- & log-data is in slab pages, or bcache meta + * data pages, or other high order pages allocated by + * __get_free_pages() without __GFP_COMP, which have a page_count + * of 0 and/or have PageSlab() set. We cannot use send_page for + * those, as that does get_page(); put_page(); and would cause + * either a VM_BUG directly, or __page_cache_release a page that + * would actually still be referenced by someone, leading to some + * obscure delayed Oops somewhere else. + */ +static inline bool sendpage_ok(struct page *page) +{ + return !PageSlab(page) && page_count(page) >= 1; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, From 15cac17d9d3906c098128e8ee3a9d8721991677a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 2 Oct 2020 16:27:31 +0800 Subject: [PATCH 36/86] tcp: use sendpage_ok() to detect misused .sendpage commit cf83a17edeeb36195596d2dae060a7c381db35f1 upstream. commit a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab objects") adds the checks for Slab pages, but the pages don't have page_count are still missing from the check. Network layer's sendpage method is not designed to send page_count 0 pages neither, therefore both PageSlab() and page_count() should be both checked for the sending page. This is exactly what sendpage_ok() does. This patch uses sendpage_ok() in do_tcp_sendpages() to detect misused .sendpage, to make the code more robust. Fixes: a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab objects") Suggested-by: Eric Dumazet Signed-off-by: Coly Li Cc: Vasily Averin Cc: David S. Miller Cc: stable@vger.kernel.org Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ffa33b5ef404b..97f2b11ce20348 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -971,7 +971,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (IS_ENABLED(CONFIG_DEBUG_VM) && - WARN_ONCE(PageSlab(page), "page must not be a Slab one")) + WARN_ONCE(!sendpage_ok(page), + "page must not be a Slab one and have page_count > 0")) return -EINVAL; /* Wait for a connection to finish. One exception is TCP Fast Open From 49af88ac6534f4d31dbd6ca157c38c37d3a72004 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 2 Oct 2020 16:27:30 +0800 Subject: [PATCH 37/86] nvme-tcp: check page by sendpage_ok() before calling kernel_sendpage() commit 7d4194abfc4de13a2663c7fee6891de8360f7a52 upstream. Currently nvme_tcp_try_send_data() doesn't use kernel_sendpage() to send slab pages. But for pages allocated by __get_free_pages() without __GFP_COMP, which also have refcount as 0, they are still sent by kernel_sendpage() to remote end, this is problematic. The new introduced helper sendpage_ok() checks both PageSlab tag and page_count counter, and returns true if the checking page is OK to be sent by kernel_sendpage(). This patch fixes the page checking issue of nvme_tcp_try_send_data() with sendpage_ok(). If sendpage_ok() returns true, send this page by kernel_sendpage(), otherwise use sock_no_sendpage to handle this page. Signed-off-by: Coly Li Cc: Chaitanya Kulkarni Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Jan Kara Cc: Jens Axboe Cc: Mikhail Skorzhinskii Cc: Philipp Reisner Cc: Sagi Grimberg Cc: Vlastimil Babka Cc: stable@vger.kernel.org Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/tcp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6d7a813e7183ab..e159b78b5f3b4e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -861,12 +861,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else flags |= MSG_MORE; - /* can't zcopy slab pages */ - if (unlikely(PageSlab(page))) { - ret = sock_no_sendpage(queue->sock, page, offset, len, + if (sendpage_ok(page)) { + ret = kernel_sendpage(queue->sock, page, offset, len, flags); } else { - ret = kernel_sendpage(queue->sock, page, offset, len, + ret = sock_no_sendpage(queue->sock, page, offset, len, flags); } if (ret <= 0) From 0afdda28eb2b97997b0a66d380c066825d63791a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 4 Aug 2020 11:37:29 +0200 Subject: [PATCH 38/86] xfrmi: drop ignore_df check before updating pmtu commit 45a36a18d01907710bad5258d81f76c18882ad88 upstream. xfrm interfaces currently test for !skb->ignore_df when deciding whether to update the pmtu on the skb's dst. Because of this, no pmtu exception is created when we do something like: ping -s 1438 By dropping this check, the pmtu exception will be created and the next ping attempt will work. Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 0ab2b35c95deb6..00af31d3e77440 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -293,7 +293,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } mtu = dst_mtu(dst); - if (!skb->ignore_df && skb->len > mtu) { + if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { From 209549c1c0f069c5207e07b8b82761f0d06baaae Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Sat, 10 Oct 2020 21:25:54 +0300 Subject: [PATCH 39/86] cifs: Fix incomplete memory allocation on setxattr path commit 64b7f674c292207624b3d788eda2dde3dc1415df upstream. On setxattr() syscall path due to an apprent typo the size of a dynamically allocated memory chunk for storing struct smb2_file_full_ea_info object is computed incorrectly, to be more precise the first addend is the size of a pointer instead of the wanted object size. Coincidentally it makes no difference on 64-bit platforms, however on 32-bit targets the following memcpy() writes 4 bytes of data outside of the dynamically allocated memory. ============================================================================= BUG kmalloc-16 (Not tainted): Redzone overwritten ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: 0x79e69a6f-0x9e5cdecf @offset=368. First byte 0x73 instead of 0xcc INFO: Slab 0xd36d2454 objects=85 used=51 fp=0xf7d0fc7a flags=0x35000201 INFO: Object 0x6f171df3 @offset=352 fp=0x00000000 Redzone 5d4ff02d: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Object 6f171df3: 00 00 00 00 00 05 06 00 73 6e 72 75 62 00 66 69 ........snrub.fi Redzone 79e69a6f: 73 68 32 0a sh2. Padding 56254d82: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ CPU: 0 PID: 8196 Comm: attr Tainted: G B 5.9.0-rc8+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 Call Trace: dump_stack+0x54/0x6e print_trailer+0x12c/0x134 check_bytes_and_report.cold+0x3e/0x69 check_object+0x18c/0x250 free_debug_processing+0xfe/0x230 __slab_free+0x1c0/0x300 kfree+0x1d3/0x220 smb2_set_ea+0x27d/0x540 cifs_xattr_set+0x57f/0x620 __vfs_setxattr+0x4e/0x60 __vfs_setxattr_noperm+0x4e/0x100 __vfs_setxattr_locked+0xae/0xd0 vfs_setxattr+0x4e/0xe0 setxattr+0x12c/0x1a0 path_setxattr+0xa4/0xc0 __ia32_sys_lsetxattr+0x1d/0x20 __do_fast_syscall_32+0x40/0x70 do_fast_syscall_32+0x29/0x60 do_SYSENTER_32+0x15/0x20 entry_SYSENTER_32+0x9f/0xf2 Fixes: 5517554e4313 ("cifs: Add support for writing attributes on SMB2+") Signed-off-by: Vladimir Zapolskiy Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 64ad466695c550..9a89e5f7c4da3c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1179,7 +1179,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = si_iov; rqst[1].rq_nvec = 1; - len = sizeof(ea) + ea_name_len + ea_value_len + 1; + len = sizeof(*ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { rc = -ENOMEM; From 6db69c39062285d0cc1cf99795b7d82bae3ad57d Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 7 Oct 2020 10:07:49 +0200 Subject: [PATCH 40/86] i2c: meson: fix clock setting overwrite commit 28683e847e2f20eed22cdd24f185d7783db396d3 upstream. When the slave address is written in do_start(), SLAVE_ADDR is written completely. This may overwrite some setting related to the clock rate or signal filtering. Fix this by writing only the bits related to slave address. To avoid causing unexpected changed, explicitly disable filtering or high/low clock mode which may have been left over by the bootloader. Fixes: 30021e3707a7 ("i2c: add support for Amlogic Meson I2C controller") Signed-off-by: Jerome Brunet Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-meson.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c index 1e2647f9a2a72f..62aaeb4d5978d2 100644 --- a/drivers/i2c/busses/i2c-meson.c +++ b/drivers/i2c/busses/i2c-meson.c @@ -5,6 +5,7 @@ * Copyright (C) 2014 Beniamino Galvani */ +#include #include #include #include @@ -37,6 +38,12 @@ #define REG_CTRL_CLKDIVEXT_SHIFT 28 #define REG_CTRL_CLKDIVEXT_MASK GENMASK(29, 28) +#define REG_SLV_ADDR GENMASK(7, 0) +#define REG_SLV_SDA_FILTER GENMASK(10, 8) +#define REG_SLV_SCL_FILTER GENMASK(13, 11) +#define REG_SLV_SCL_LOW GENMASK(27, 16) +#define REG_SLV_SCL_LOW_EN BIT(28) + #define I2C_TIMEOUT_MS 500 enum { @@ -146,6 +153,9 @@ static void meson_i2c_set_clk_div(struct meson_i2c *i2c, unsigned int freq) meson_i2c_set_mask(i2c, REG_CTRL, REG_CTRL_CLKDIVEXT_MASK, (div >> 10) << REG_CTRL_CLKDIVEXT_SHIFT); + /* Disable HIGH/LOW mode */ + meson_i2c_set_mask(i2c, REG_SLAVE_ADDR, REG_SLV_SCL_LOW_EN, 0); + dev_dbg(i2c->dev, "%s: clk %lu, freq %u, div %u\n", __func__, clk_rate, freq, div); } @@ -273,7 +283,10 @@ static void meson_i2c_do_start(struct meson_i2c *i2c, struct i2c_msg *msg) token = (msg->flags & I2C_M_RD) ? TOKEN_SLAVE_ADDR_READ : TOKEN_SLAVE_ADDR_WRITE; - writel(msg->addr << 1, i2c->regs + REG_SLAVE_ADDR); + + meson_i2c_set_mask(i2c, REG_SLAVE_ADDR, REG_SLV_ADDR, + FIELD_PREP(REG_SLV_ADDR, msg->addr << 1)); + meson_i2c_add_token(i2c, TOKEN_START); meson_i2c_add_token(i2c, token); } @@ -432,6 +445,10 @@ static int meson_i2c_probe(struct platform_device *pdev) return ret; } + /* Disable filtering */ + meson_i2c_set_mask(i2c, REG_SLAVE_ADDR, + REG_SLV_SDA_FILTER | REG_SLV_SCL_FILTER, 0); + meson_i2c_set_clk_div(i2c, timings.bus_freq_hz); return 0; From abe997f632d1d87d4d4880d57fdf944c7eaf4b83 Mon Sep 17 00:00:00 2001 From: Nicolas Belin Date: Wed, 7 Oct 2020 10:07:51 +0200 Subject: [PATCH 41/86] i2c: meson: fixup rate calculation with filter delay commit 1334d3b4e49e35d8912a7c37ffca4c5afb9a0516 upstream. Apparently, 15 cycles of the peripheral clock are used by the controller for sampling and filtering. Because this was not known before, the rate calculation is slightly off. Clean up and fix the calculation taking this filtering delay into account. Fixes: 30021e3707a7 ("i2c: add support for Amlogic Meson I2C controller") Signed-off-by: Nicolas Belin Signed-off-by: Jerome Brunet Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-meson.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c index 62aaeb4d5978d2..6661481f125cdd 100644 --- a/drivers/i2c/busses/i2c-meson.c +++ b/drivers/i2c/busses/i2c-meson.c @@ -33,10 +33,8 @@ #define REG_CTRL_ACK_IGNORE BIT(1) #define REG_CTRL_STATUS BIT(2) #define REG_CTRL_ERROR BIT(3) -#define REG_CTRL_CLKDIV_SHIFT 12 -#define REG_CTRL_CLKDIV_MASK GENMASK(21, 12) -#define REG_CTRL_CLKDIVEXT_SHIFT 28 -#define REG_CTRL_CLKDIVEXT_MASK GENMASK(29, 28) +#define REG_CTRL_CLKDIV GENMASK(21, 12) +#define REG_CTRL_CLKDIVEXT GENMASK(29, 28) #define REG_SLV_ADDR GENMASK(7, 0) #define REG_SLV_SDA_FILTER GENMASK(10, 8) @@ -45,6 +43,7 @@ #define REG_SLV_SCL_LOW_EN BIT(28) #define I2C_TIMEOUT_MS 500 +#define FILTER_DELAY 15 enum { TOKEN_END = 0, @@ -139,19 +138,21 @@ static void meson_i2c_set_clk_div(struct meson_i2c *i2c, unsigned int freq) unsigned long clk_rate = clk_get_rate(i2c->clk); unsigned int div; - div = DIV_ROUND_UP(clk_rate, freq * i2c->data->div_factor); + div = DIV_ROUND_UP(clk_rate, freq); + div -= FILTER_DELAY; + div = DIV_ROUND_UP(div, i2c->data->div_factor); /* clock divider has 12 bits */ - if (div >= (1 << 12)) { + if (div > GENMASK(11, 0)) { dev_err(i2c->dev, "requested bus frequency too low\n"); - div = (1 << 12) - 1; + div = GENMASK(11, 0); } - meson_i2c_set_mask(i2c, REG_CTRL, REG_CTRL_CLKDIV_MASK, - (div & GENMASK(9, 0)) << REG_CTRL_CLKDIV_SHIFT); + meson_i2c_set_mask(i2c, REG_CTRL, REG_CTRL_CLKDIV, + FIELD_PREP(REG_CTRL_CLKDIV, div & GENMASK(9, 0))); - meson_i2c_set_mask(i2c, REG_CTRL, REG_CTRL_CLKDIVEXT_MASK, - (div >> 10) << REG_CTRL_CLKDIVEXT_SHIFT); + meson_i2c_set_mask(i2c, REG_CTRL, REG_CTRL_CLKDIVEXT, + FIELD_PREP(REG_CTRL_CLKDIVEXT, div >> 10)); /* Disable HIGH/LOW mode */ meson_i2c_set_mask(i2c, REG_SLAVE_ADDR, REG_SLV_SCL_LOW_EN, 0); From 040e3110d49cf20a03458b36051e7fb21b19b5c0 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 9 Oct 2020 00:44:39 +0300 Subject: [PATCH 42/86] i2c: owl: Clear NACK and BUS error bits commit f5b3f433641c543ebe5171285a42aa6adcdb2d22 upstream. When the NACK and BUS error bits are set by the hardware, the driver is responsible for clearing them by writing "1" into the corresponding status registers. Hence perform the necessary operations in owl_i2c_interrupt(). Fixes: d211e62af466 ("i2c: Add Actions Semiconductor Owl family S900 I2C driver") Reported-by: Manivannan Sadhasivam Signed-off-by: Cristian Ciocaltea Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-owl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/busses/i2c-owl.c b/drivers/i2c/busses/i2c-owl.c index b6b5a495118b6d..a567fd2b295e19 100644 --- a/drivers/i2c/busses/i2c-owl.c +++ b/drivers/i2c/busses/i2c-owl.c @@ -179,6 +179,9 @@ static irqreturn_t owl_i2c_interrupt(int irq, void *_dev) fifostat = readl(i2c_dev->base + OWL_I2C_REG_FIFOSTAT); if (fifostat & OWL_I2C_FIFOSTAT_RNB) { i2c_dev->err = -ENXIO; + /* Clear NACK error bit by writing "1" */ + owl_i2c_update_reg(i2c_dev->base + OWL_I2C_REG_FIFOSTAT, + OWL_I2C_FIFOSTAT_RNB, true); goto stop; } @@ -186,6 +189,9 @@ static irqreturn_t owl_i2c_interrupt(int irq, void *_dev) stat = readl(i2c_dev->base + OWL_I2C_REG_STAT); if (stat & OWL_I2C_STAT_BEB) { i2c_dev->err = -EIO; + /* Clear BUS error bit by writing "1" */ + owl_i2c_update_reg(i2c_dev->base + OWL_I2C_REG_STAT, + OWL_I2C_STAT_BEB, true); goto stop; } From fb3681c20fbfb990860cb9a19fbe96882298c21a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2020 01:38:31 -0700 Subject: [PATCH 43/86] sctp: fix sctp_auth_init_hmacs() error path commit d42ee76ecb6c49d499fc5eb32ca34468d95dbc3e upstream. After freeing ep->auth_hmacs we have to clear the pointer or risk use-after-free as reported by syzbot: BUG: KASAN: use-after-free in sctp_auth_destroy_hmacs net/sctp/auth.c:509 [inline] BUG: KASAN: use-after-free in sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline] BUG: KASAN: use-after-free in sctp_auth_free+0x17e/0x1d0 net/sctp/auth.c:1070 Read of size 8 at addr ffff8880a8ff52c0 by task syz-executor941/6874 CPU: 0 PID: 6874 Comm: syz-executor941 Not tainted 5.9.0-rc8-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 sctp_auth_destroy_hmacs net/sctp/auth.c:509 [inline] sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline] sctp_auth_free+0x17e/0x1d0 net/sctp/auth.c:1070 sctp_endpoint_destroy+0x95/0x240 net/sctp/endpointola.c:203 sctp_endpoint_put net/sctp/endpointola.c:236 [inline] sctp_endpoint_free+0xd6/0x110 net/sctp/endpointola.c:183 sctp_destroy_sock+0x9c/0x3c0 net/sctp/socket.c:4981 sctp_v6_destroy_sock+0x11/0x20 net/sctp/socket.c:9415 sk_common_release+0x64/0x390 net/core/sock.c:3254 sctp_close+0x4ce/0x8b0 net/sctp/socket.c:1533 inet_release+0x12e/0x280 net/ipv4/af_inet.c:431 inet6_release+0x4c/0x70 net/ipv6/af_inet6.c:475 __sock_release+0xcd/0x280 net/socket.c:596 sock_close+0x18/0x20 net/socket.c:1277 __fput+0x285/0x920 fs/file_table.c:281 task_work_run+0xdd/0x190 kernel/task_work.c:141 exit_task_work include/linux/task_work.h:25 [inline] do_exit+0xb7d/0x29f0 kernel/exit.c:806 do_group_exit+0x125/0x310 kernel/exit.c:903 __do_sys_exit_group kernel/exit.c:914 [inline] __se_sys_exit_group kernel/exit.c:912 [inline] __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:912 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x43f278 Code: Bad RIP value. RSP: 002b:00007fffe0995c38 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000043f278 RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000 RBP: 00000000004bf068 R08: 00000000000000e7 R09: ffffffffffffffd0 R10: 0000000020000000 R11: 0000000000000246 R12: 0000000000000001 R13: 00000000006d1180 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 6874: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:461 kmem_cache_alloc_trace+0x174/0x300 mm/slab.c:3554 kmalloc include/linux/slab.h:554 [inline] kmalloc_array include/linux/slab.h:593 [inline] kcalloc include/linux/slab.h:605 [inline] sctp_auth_init_hmacs+0xdb/0x3b0 net/sctp/auth.c:464 sctp_auth_init+0x8a/0x4a0 net/sctp/auth.c:1049 sctp_setsockopt_auth_supported net/sctp/socket.c:4354 [inline] sctp_setsockopt+0x477e/0x97f0 net/sctp/socket.c:4631 __sys_setsockopt+0x2db/0x610 net/socket.c:2132 __do_sys_setsockopt net/socket.c:2143 [inline] __se_sys_setsockopt net/socket.c:2140 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2140 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 6874: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0xd8/0x120 mm/kasan/common.c:422 __cache_free mm/slab.c:3422 [inline] kfree+0x10e/0x2b0 mm/slab.c:3760 sctp_auth_destroy_hmacs net/sctp/auth.c:511 [inline] sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline] sctp_auth_init_hmacs net/sctp/auth.c:496 [inline] sctp_auth_init_hmacs+0x2b7/0x3b0 net/sctp/auth.c:454 sctp_auth_init+0x8a/0x4a0 net/sctp/auth.c:1049 sctp_setsockopt_auth_supported net/sctp/socket.c:4354 [inline] sctp_setsockopt+0x477e/0x97f0 net/sctp/socket.c:4631 __sys_setsockopt+0x2db/0x610 net/socket.c:2132 __do_sys_setsockopt net/socket.c:2143 [inline] __se_sys_setsockopt net/socket.c:2140 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2140 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1f485649f529 ("[SCTP]: Implement SCTP-AUTH internals") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Cc: Neil Horman Cc: Marcelo Ricardo Leitner Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sctp/auth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 4278764d82b827..1d898ee4018c92 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -494,6 +494,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) out_err: /* Clean up any successful allocations */ sctp_auth_destroy_hmacs(ep->auth_hmacs); + ep->auth_hmacs = NULL; return -ENOMEM; } From 003269d8d6de1762a0ae386a2ad31fb1155cdf44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2020 06:38:08 -0700 Subject: [PATCH 44/86] team: set dev->needed_headroom in team_setup_by_port() commit 89d01748b2354e210b5d4ea47bc25a42a1b42c82 upstream. Some devices set needed_headroom. If we ignore it, we might end up crashing in various skb_push() for example in ipgre_header() since some layers assume enough headroom has been reserved. Fixes: 1d76efe1577b ("team: add support for non-ethernet devices") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/team/team.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f935..e1423770f9ce57 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2111,6 +2111,7 @@ static void team_setup_by_port(struct net_device *dev, dev->header_ops = port_dev->header_ops; dev->type = port_dev->type; dev->hard_header_len = port_dev->hard_header_len; + dev->needed_headroom = port_dev->needed_headroom; dev->addr_len = port_dev->addr_len; dev->mtu = port_dev->mtu; memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len); From f89128ad358e016d79490bb641b7f2916aee1e08 Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Mon, 5 Oct 2020 02:25:36 +0530 Subject: [PATCH 45/86] net: team: fix memory leak in __team_options_register commit 9a9e77495958c7382b2438bc19746dd3aaaabb8e upstream. The variable "i" isn't initialized back correctly after the first loop under the label inst_rollback gets executed. The value of "i" is assigned to be option_count - 1, and the ensuing loop (under alloc_rollback) begins by initializing i--. Thus, the value of i when the loop begins execution will now become i = option_count - 2. Thus, when kfree(dst_opts[i]) is called in the second loop in this order, (i.e., inst_rollback followed by alloc_rollback), dst_optsp[option_count - 2] is the first element freed, and dst_opts[option_count - 1] does not get freed, and thus, a memory leak is caused. This memory leak can be fixed, by assigning i = option_count (instead of option_count - 1). Fixes: 80f7c6683fe0 ("team: add support for per-port options") Reported-by: syzbot+69b804437cfec30deac3@syzkaller.appspotmail.com Tested-by: syzbot+69b804437cfec30deac3@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/team/team.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index e1423770f9ce57..606fee99221b84 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -287,7 +287,7 @@ static int __team_options_register(struct team *team, for (i--; i >= 0; i--) __team_option_inst_del_option(team, dst_opts[i]); - i = option_count - 1; + i = option_count; alloc_rollback: for (i--; i >= 0; i--) kfree(dst_opts[i]); From 4034664a733ebfaa7a32a12f49d75afa171ab082 Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Wed, 7 Oct 2020 17:48:03 +0200 Subject: [PATCH 46/86] openvswitch: handle DNAT tuple collision commit 8aa7b526dc0b5dbf40c1b834d76a667ad672a410 upstream. With multiple DNAT rules it's possible that after destination translation the resulting tuples collide. For example, two openvswitch flows: nw_dst=10.0.0.10,tp_dst=10, actions=ct(commit,table=2,nat(dst=20.0.0.1:20)) nw_dst=10.0.0.20,tp_dst=10, actions=ct(commit,table=2,nat(dst=20.0.0.1:20)) Assuming two TCP clients initiating the following connections: 10.0.0.10:5000->10.0.0.10:10 10.0.0.10:5000->10.0.0.20:10 Both tuples would translate to 10.0.0.10:5000->20.0.0.1:20 causing nf_conntrack_confirm() to fail because of tuple collision. Netfilter handles this case by allocating a null binding for SNAT at egress by default. Perform the same operation in openvswitch for DNAT if no explicit SNAT is requested by the user and allocate a null binding for SNAT for packets in the "original" direction. Reported-at: https://bugzilla.redhat.com/1877128 Suggested-by: Florian Westphal Fixes: 05752523e565 ("openvswitch: Interface with NAT.") Signed-off-by: Dumitru Ceara Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/openvswitch/conntrack.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c86e404cd65bb5..d06d7d58eaf271 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -905,15 +905,19 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, } err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); - if (err == NF_ACCEPT && - ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, + NF_NAT_MANIP_SRC); + } } /* Mark NAT done if successful and update the flow key. */ From bdffb36bcd383f7bd7204362d261bf6c84993db9 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 15 Sep 2020 17:07:35 -0400 Subject: [PATCH 47/86] drm/amdgpu: prevent double kfree ttm->sg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1d0e16ac1a9e800598dcfa5b6bc53b704a103390 ] Set ttm->sg to NULL after kfree, to avoid memory corruption backtrace: [ 420.932812] kernel BUG at /build/linux-do9eLF/linux-4.15.0/mm/slub.c:295! [ 420.934182] invalid opcode: 0000 [#1] SMP NOPTI [ 420.935445] Modules linked in: xt_conntrack ipt_MASQUERADE [ 420.951332] Hardware name: Dell Inc. PowerEdge R7525/0PYVT1, BIOS 1.5.4 07/09/2020 [ 420.952887] RIP: 0010:__slab_free+0x180/0x2d0 [ 420.954419] RSP: 0018:ffffbe426291fa60 EFLAGS: 00010246 [ 420.955963] RAX: ffff9e29263e9c30 RBX: ffff9e29263e9c30 RCX: 000000018100004b [ 420.957512] RDX: ffff9e29263e9c30 RSI: fffff3d33e98fa40 RDI: ffff9e297e407a80 [ 420.959055] RBP: ffffbe426291fb00 R08: 0000000000000001 R09: ffffffffc0d39ade [ 420.960587] R10: ffffbe426291fb20 R11: ffff9e49ffdd4000 R12: ffff9e297e407a80 [ 420.962105] R13: fffff3d33e98fa40 R14: ffff9e29263e9c30 R15: ffff9e2954464fd8 [ 420.963611] FS: 00007fa2ea097780(0000) GS:ffff9e297e840000(0000) knlGS:0000000000000000 [ 420.965144] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 420.966663] CR2: 00007f16bfffefb8 CR3: 0000001ff0c62000 CR4: 0000000000340ee0 [ 420.968193] Call Trace: [ 420.969703] ? __page_cache_release+0x3c/0x220 [ 420.971294] ? amdgpu_ttm_tt_unpopulate+0x5e/0x80 [amdgpu] [ 420.972789] kfree+0x168/0x180 [ 420.974353] ? amdgpu_ttm_tt_set_user_pages+0x64/0xc0 [amdgpu] [ 420.975850] ? kfree+0x168/0x180 [ 420.977403] amdgpu_ttm_tt_unpopulate+0x5e/0x80 [amdgpu] [ 420.978888] ttm_tt_unpopulate.part.10+0x53/0x60 [amdttm] [ 420.980357] ttm_tt_destroy.part.11+0x4f/0x60 [amdttm] [ 420.981814] ttm_tt_destroy+0x13/0x20 [amdttm] [ 420.983273] ttm_bo_cleanup_memtype_use+0x36/0x80 [amdttm] [ 420.984725] ttm_bo_release+0x1c9/0x360 [amdttm] [ 420.986167] amdttm_bo_put+0x24/0x30 [amdttm] [ 420.987663] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 420.989165] amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu+0x9ca/0xb10 [amdgpu] [ 420.990666] kfd_ioctl_alloc_memory_of_gpu+0xef/0x2c0 [amdgpu] Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index f15ded1ce9057d..c6a1dfe79e8095 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -967,6 +967,7 @@ static int amdgpu_ttm_tt_pin_userptr(struct ttm_tt *ttm) release_sg: kfree(ttm->sg); + ttm->sg = NULL; return r; } From f825fd534f8b4b07cbcfbbb7ad570880ee4a9bb3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 27 Sep 2020 14:24:28 +0800 Subject: [PATCH 48/86] iommu/vt-d: Fix lockdep splat in iommu_flush_dev_iotlb() [ Upstream commit 1a3f2fd7fc4e8f24510830e265de2ffb8e3300d2 ] Lock(&iommu->lock) without disabling irq causes lockdep warnings. [ 12.703950] ======================================================== [ 12.703962] WARNING: possible irq lock inversion dependency detected [ 12.703975] 5.9.0-rc6+ #659 Not tainted [ 12.703983] -------------------------------------------------------- [ 12.703995] systemd-udevd/284 just changed the state of lock: [ 12.704007] ffffffffbd6ff4d8 (device_domain_lock){..-.}-{2:2}, at: iommu_flush_dev_iotlb.part.57+0x2e/0x90 [ 12.704031] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 12.704043] (&iommu->lock){+.+.}-{2:2} [ 12.704045] and interrupts could create inverse lock ordering between them. [ 12.704073] other info that might help us debug this: [ 12.704085] Possible interrupt unsafe locking scenario: [ 12.704097] CPU0 CPU1 [ 12.704106] ---- ---- [ 12.704115] lock(&iommu->lock); [ 12.704123] local_irq_disable(); [ 12.704134] lock(device_domain_lock); [ 12.704146] lock(&iommu->lock); [ 12.704158] [ 12.704164] lock(device_domain_lock); [ 12.704174] *** DEADLOCK *** Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200927062428.13713-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin --- drivers/iommu/intel-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2ffec65df3889e..1147626f0d2535 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2560,14 +2560,14 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, } /* Setup the PASID entry for requests without PASID: */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, flags); if (hw_pass_through && domain_type_is_si(domain)) ret = intel_pasid_setup_pass_through(iommu, domain, dev, PASID_RID2PASID); else ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, flags); if (ret) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); From 0bea401a9a5a05694a55b9754f2ab0dd1fc5c1d0 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 4 Sep 2020 08:49:38 +0200 Subject: [PATCH 49/86] xfrm: clone XFRMA_SET_MARK in xfrm_do_migrate [ Upstream commit 545e5c571662b1cd79d9588f9d3b6e36985b8007 ] XFRMA_SET_MARK and XFRMA_SET_MARK_MASK was not cloned from the old to the new. Migrate these two attributes during XFRMA_MSG_MIGRATE Fixes: 9b42c1f179a6 ("xfrm: Extend the output_mark to support input direction and masking.") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f3423562d93365..10d30f0338d720 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1507,6 +1507,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, } memcpy(&x->mark, &orig->mark, sizeof(x->mark)); + memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); if (xfrm_init_state(x) < 0) goto error; From c1becfebe33efa7dc6dbe974d0c3e1107f15b5db Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 4 Sep 2020 08:49:55 +0200 Subject: [PATCH 50/86] xfrm: clone XFRMA_REPLAY_ESN_VAL in xfrm_do_migrate [ Upstream commit 91a46c6d1b4fcbfa4773df9421b8ad3e58088101 ] XFRMA_REPLAY_ESN_VAL was not cloned completely from the old to the new. Migrate this attribute during XFRMA_MSG_MIGRATE v1->v2: - move curleft cloning to a separate patch Fixes: af2f464e326e ("xfrm: Assign esn pointers when cloning a state") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/xfrm.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 12aa6e15e43f68..c00b9ae71ae404 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1773,21 +1773,17 @@ static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_es static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { - x->replay_esn = kzalloc(xfrm_replay_state_esn_len(orig->replay_esn), + + x->replay_esn = kmemdup(orig->replay_esn, + xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; - - x->replay_esn->bmp_len = orig->replay_esn->bmp_len; - x->replay_esn->replay_window = orig->replay_esn->replay_window; - - x->preplay_esn = kmemdup(x->replay_esn, - xfrm_replay_state_esn_len(x->replay_esn), + x->preplay_esn = kmemdup(orig->preplay_esn, + xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); - if (!x->preplay_esn) { - kfree(x->replay_esn); + if (!x->preplay_esn) return -ENOMEM; - } return 0; } From 7ea7436c406c1f1ce02d54055e89a862d9ef0f6b Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 4 Sep 2020 08:50:11 +0200 Subject: [PATCH 51/86] xfrm: clone XFRMA_SEC_CTX in xfrm_do_migrate [ Upstream commit 7aa05d304785204703a67a6aa7f1db402889a172 ] XFRMA_SEC_CTX was not cloned from the old to the new. Migrate this attribute during XFRMA_MSG_MIGRATE v1->v2: - return -ENOMEM on error v2->v3: - fix return type to int Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_state.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 10d30f0338d720..fc1b391ba1554c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1438,6 +1438,30 @@ int xfrm_state_add(struct xfrm_state *x) EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE +static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security) +{ + struct xfrm_user_sec_ctx *uctx; + int size = sizeof(*uctx) + security->ctx_len; + int err; + + uctx = kmalloc(size, GFP_KERNEL); + if (!uctx) + return -ENOMEM; + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = size; + uctx->ctx_doi = security->ctx_doi; + uctx->ctx_alg = security->ctx_alg; + uctx->ctx_len = security->ctx_len; + memcpy(uctx + 1, security->ctx_str, security->ctx_len); + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + if (err) + return err; + + return 0; +} + static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, struct xfrm_encap_tmpl *encap) { @@ -1494,6 +1518,10 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, goto error; } + if (orig->security) + if (clone_security(x, orig->security)) + goto error; + if (orig->coaddr) { x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), GFP_KERNEL); From e9a12de5a2bedeeb34aa0e56d5873222226558eb Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 4 Sep 2020 08:50:29 +0200 Subject: [PATCH 52/86] xfrm: clone whole liftime_cur structure in xfrm_do_migrate [ Upstream commit 8366685b2883e523f91e9816d7be371eb1144749 ] When we clone state only add_time was cloned. It missed values like bytes, packets. Now clone the all members of the structure. v1->v3: - use memcpy to copy the entire structure Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index fc1b391ba1554c..8df460eaac275d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1547,7 +1547,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; - x->curlft.add_time = orig->curlft.add_time; + memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay; From dd2786a3e5219bbcc168361892e72cc6aa80f09b Mon Sep 17 00:00:00 2001 From: Voon Weifeng Date: Wed, 23 Sep 2020 16:56:14 +0800 Subject: [PATCH 53/86] net: stmmac: removed enabling eee in EEE set callback [ Upstream commit 7241c5a697479c7d0c5a96595822cdab750d41ae ] EEE should be only be enabled during stmmac_mac_link_up() when the link are up and being set up properly. set_eee should only do settings configuration and disabling the eee. Without this fix, turning on EEE using ethtool will return "Operation not supported". This is due to the driver is in a dead loop waiting for eee to be advertised in the for eee to be activated but the driver will only configure the EEE advertisement after the eee is activated. Ethtool should only return "Operation not supported" if there is no EEE capbility in the MAC controller. Fixes: 8a7493e58ad6 ("net: stmmac: Fix a race in EEE enable callback") Signed-off-by: Voon Weifeng Acked-by: Mark Gross Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 1a768837ca728a..ce1346c14b05a0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -662,23 +662,16 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev, struct stmmac_priv *priv = netdev_priv(dev); int ret; - if (!edata->eee_enabled) { + if (!priv->dma_cap.eee) + return -EOPNOTSUPP; + + if (!edata->eee_enabled) stmmac_disable_eee_mode(priv); - } else { - /* We are asking for enabling the EEE but it is safe - * to verify all by invoking the eee_init function. - * In case of failure it will return an error. - */ - edata->eee_enabled = stmmac_eee_init(priv); - if (!edata->eee_enabled) - return -EOPNOTSUPP; - } ret = phylink_ethtool_set_eee(priv->phylink, edata); if (ret) return ret; - priv->eee_enabled = edata->eee_enabled; priv->tx_lpi_timer = edata->tx_lpi_timer; return 0; } From 4d3edb2e4d6e3e2ef5d740470a9e63d177d177ab Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Thu, 17 Sep 2020 19:16:53 +0300 Subject: [PATCH 54/86] platform/x86: fix kconfig dependency warning for FUJITSU_LAPTOP [ Upstream commit afdd1ebb72051e8b6b83c4d7dc542a9be0e1352d ] When FUJITSU_LAPTOP is enabled and NEW_LEDS is disabled, it results in the following Kbuild warning: WARNING: unmet direct dependencies detected for LEDS_CLASS Depends on [n]: NEW_LEDS [=n] Selected by [y]: - FUJITSU_LAPTOP [=y] && X86 [=y] && X86_PLATFORM_DEVICES [=y] && ACPI [=y] && INPUT [=y] && BACKLIGHT_CLASS_DEVICE [=y] && (ACPI_VIDEO [=n] || ACPI_VIDEO [=n]=n) The reason is that FUJITSU_LAPTOP selects LEDS_CLASS without depending on or selecting NEW_LEDS while LEDS_CLASS is subordinate to NEW_LEDS. Honor the kconfig menu hierarchy to remove kconfig dependency warnings. Reported-by: Hans de Goede Fixes: d89bcc83e709 ("platform/x86: fujitsu-laptop: select LEDS_CLASS") Signed-off-by: Necip Fazil Yildiran Signed-off-by: Andy Shevchenko Signed-off-by: Sasha Levin --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1cab993205142c..000d5693fae74f 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -269,6 +269,7 @@ config FUJITSU_LAPTOP depends on BACKLIGHT_CLASS_DEVICE depends on ACPI_VIDEO || ACPI_VIDEO = n select INPUT_SPARSEKMAP + select NEW_LEDS select LEDS_CLASS ---help--- This is a driver for laptops built by Fujitsu: From 84ab35eacdf244e9950505d1a5034044745ebec9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Sep 2020 14:42:56 +1000 Subject: [PATCH 55/86] xfrm: Use correct address family in xfrm_state_find [ Upstream commit e94ee171349db84c7cfdc5fefbebe414054d0924 ] The struct flowi must never be interpreted by itself as its size depends on the address family. Therefore it must always be grouped with its original family value. In this particular instance, the original family value is lost in the function xfrm_state_find. Therefore we get a bogus read when it's coupled with the wrong family which would occur with inter- family xfrm states. This patch fixes it by keeping the original family value. Note that the same bug could potentially occur in LSM through the xfrm_state_pol_flow_match hook. I checked the current code there and it seems to be safe for now as only secid is used which is part of struct flowi_common. But that API should be changed so that so that we don't get new bugs in the future. We could do that by replacing fl with just secid or adding a family field. Reported-by: syzbot+577fbac3145a6eb2e7a5@syzkaller.appspotmail.com Fixes: 48b8d78315bf ("[XFRM]: State selection update to use inner...") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_state.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8df460eaac275d..aaea8cb7459d8f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1016,7 +1016,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && - !xfrm_selector_match(&x->sel, fl, x->sel.family)) || + (x->sel.family != family || + !xfrm_selector_match(&x->sel, fl, family))) || !security_xfrm_state_pol_flow_match(x, pol, fl)) return; @@ -1029,7 +1030,9 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, x->sel.family) && + if ((!x->sel.family || + (x->sel.family == family && + xfrm_selector_match(&x->sel, fl, family))) && security_xfrm_state_pol_flow_match(x, pol, fl)) *error = -ESRCH; } @@ -1069,7 +1072,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } if (best || acquire_in_progress) @@ -1086,7 +1089,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } From 029ced5cce896fc0ee3dd0b7600cdbca028a868c Mon Sep 17 00:00:00 2001 From: Vaibhav Gupta Date: Mon, 29 Jun 2020 14:59:39 +0530 Subject: [PATCH 56/86] iavf: use generic power management [ Upstream commit bc5cbd73eb493944b8665dc517f684c40eb18a4a ] With the support of generic PM callbacks, drivers no longer need to use legacy .suspend() and .resume() in which they had to maintain PCI states changes and device's power state themselves. The required operations are done by PCI core. PCI drivers are not expected to invoke PCI helper functions like pci_save/restore_state(), pci_enable/disable_device(), pci_set_power_state(), etc. Their tasks are completed by PCI core itself. Compile-tested only. Signed-off-by: Vaibhav Gupta Tested-by: Andrew Bowers Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 45 ++++++--------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 222ae76809aa1a..bcb95b2ea792f2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3773,7 +3773,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return err; } -#ifdef CONFIG_PM /** * iavf_suspend - Power management suspend routine * @pdev: PCI device information struct @@ -3781,11 +3780,10 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * * Called when the system (VM) is entering sleep/suspend. **/ -static int iavf_suspend(struct pci_dev *pdev, pm_message_t state) +static int __maybe_unused iavf_suspend(struct device *dev_d) { - struct net_device *netdev = pci_get_drvdata(pdev); + struct net_device *netdev = dev_get_drvdata(dev_d); struct iavf_adapter *adapter = netdev_priv(netdev); - int retval = 0; netif_device_detach(netdev); @@ -3803,12 +3801,6 @@ static int iavf_suspend(struct pci_dev *pdev, pm_message_t state) clear_bit(__IAVF_IN_CRITICAL_TASK, &adapter->crit_section); - retval = pci_save_state(pdev); - if (retval) - return retval; - - pci_disable_device(pdev); - return 0; } @@ -3818,24 +3810,13 @@ static int iavf_suspend(struct pci_dev *pdev, pm_message_t state) * * Called when the system (VM) is resumed from sleep/suspend. **/ -static int iavf_resume(struct pci_dev *pdev) +static int __maybe_unused iavf_resume(struct device *dev_d) { + struct pci_dev *pdev = to_pci_dev(dev_d); struct iavf_adapter *adapter = pci_get_drvdata(pdev); struct net_device *netdev = adapter->netdev; u32 err; - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - /* pci_restore_state clears dev->state_saved so call - * pci_save_state to restore it. - */ - pci_save_state(pdev); - - err = pci_enable_device_mem(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device from suspend.\n"); - return err; - } pci_set_master(pdev); rtnl_lock(); @@ -3859,7 +3840,6 @@ static int iavf_resume(struct pci_dev *pdev) return err; } -#endif /* CONFIG_PM */ /** * iavf_remove - Device Removal Routine * @pdev: PCI device information struct @@ -3961,16 +3941,15 @@ static void iavf_remove(struct pci_dev *pdev) pci_disable_device(pdev); } +static SIMPLE_DEV_PM_OPS(iavf_pm_ops, iavf_suspend, iavf_resume); + static struct pci_driver iavf_driver = { - .name = iavf_driver_name, - .id_table = iavf_pci_tbl, - .probe = iavf_probe, - .remove = iavf_remove, -#ifdef CONFIG_PM - .suspend = iavf_suspend, - .resume = iavf_resume, -#endif - .shutdown = iavf_shutdown, + .name = iavf_driver_name, + .id_table = iavf_pci_tbl, + .probe = iavf_probe, + .remove = iavf_remove, + .driver.pm = &iavf_pm_ops, + .shutdown = iavf_shutdown, }; /** From e987ea087fd2920dfc2699472371869bb59257b4 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Wed, 2 Sep 2020 12:54:59 +0000 Subject: [PATCH 57/86] iavf: Fix incorrect adapter get in iavf_resume [ Upstream commit 75598a8fc0e0dff2aa5d46c62531b36a595f1d4f ] When calling iavf_resume there was a crash because wrong function was used to get iavf_adapter and net_device pointers. Changed how iavf_resume is getting iavf_adapter and net_device pointers from pci_dev. Fixes: 5eae00c57f5e ("i40evf: main driver core") Signed-off-by: Sylwester Dziedziuch Reviewed-by: Aleksandr Loktionov Tested-by: Aaron Brown Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index bcb95b2ea792f2..cd95d6af8fc1b6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3813,8 +3813,8 @@ static int __maybe_unused iavf_suspend(struct device *dev_d) static int __maybe_unused iavf_resume(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); - struct iavf_adapter *adapter = pci_get_drvdata(pdev); - struct net_device *netdev = adapter->netdev; + struct net_device *netdev = pci_get_drvdata(pdev); + struct iavf_adapter *adapter = netdev_priv(netdev); u32 err; pci_set_master(pdev); From 3ce96a55b7561ff59b763d68db54d76a3436c418 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Fri, 25 Sep 2020 15:44:39 +0300 Subject: [PATCH 58/86] net: ethernet: cavium: octeon_mgmt: use phy_start and phy_stop [ Upstream commit 4663ff60257aec4ee1e2e969a7c046f0aff35ab8 ] To start also "phy state machine", with UP state as it should be, the phy_start() has to be used, in another case machine even is not triggered. After this change negotiation is supposed to be triggered by SM workqueue. It's not correct usage, but it appears after the following patch, so add it as a fix. Fixes: 74a992b3598a ("net: phy: add phy_check_link_status") Signed-off-by: Ivan Khoronzhuk Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c index d375e438d8054e..4fa9d485e2096e 100644 --- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c @@ -1222,7 +1222,7 @@ static int octeon_mgmt_open(struct net_device *netdev) */ if (netdev->phydev) { netif_carrier_off(netdev); - phy_start_aneg(netdev->phydev); + phy_start(netdev->phydev); } netif_wake_queue(netdev); @@ -1250,8 +1250,10 @@ static int octeon_mgmt_stop(struct net_device *netdev) napi_disable(&p->napi); netif_stop_queue(netdev); - if (netdev->phydev) + if (netdev->phydev) { + phy_stop(netdev->phydev); phy_disconnect(netdev->phydev); + } netif_carrier_off(netdev); From 8d103b1f9ce52d06e4db3c829e1d49756936ec43 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2020 06:38:07 -0700 Subject: [PATCH 59/86] bonding: set dev->needed_headroom in bond_setup_by_slave() [ Upstream commit f32f19339596b214c208c0dba716f4b6cc4f6958 ] syzbot managed to crash a host by creating a bond with a GRE device. For non Ethernet device, bonding calls bond_setup_by_slave() instead of ether_setup(), and unfortunately dev->needed_headroom was not copied from the new added member. [ 171.243095] skbuff: skb_under_panic: text:ffffffffa184b9ea len:116 put:20 head:ffff883f84012dc0 data:ffff883f84012dbc tail:0x70 end:0xd00 dev:bond0 [ 171.243111] ------------[ cut here ]------------ [ 171.243112] kernel BUG at net/core/skbuff.c:112! [ 171.243117] invalid opcode: 0000 [#1] SMP KASAN PTI [ 171.243469] gsmi: Log Shutdown Reason 0x03 [ 171.243505] Call Trace: [ 171.243506] [ 171.243512] [] skb_push+0x49/0x50 [ 171.243516] [] ipgre_header+0x2a/0xf0 [ 171.243520] [] neigh_connected_output+0xb7/0x100 [ 171.243524] [] ip6_finish_output2+0x383/0x490 [ 171.243528] [] __ip6_finish_output+0xa2/0x110 [ 171.243531] [] ip6_finish_output+0x2c/0xa0 [ 171.243534] [] ip6_output+0x69/0x110 [ 171.243537] [] ? ip6_output+0x110/0x110 [ 171.243541] [] mld_sendpack+0x1b2/0x2d0 [ 171.243544] [] ? mld_send_report+0xf0/0xf0 [ 171.243548] [] mld_ifc_timer_expire+0x2d7/0x3b0 [ 171.243551] [] ? mld_gq_timer_expire+0x50/0x50 [ 171.243556] [] call_timer_fn+0x30/0x130 [ 171.243559] [] expire_timers+0x4c/0x110 [ 171.243563] [] __run_timers+0x213/0x260 [ 171.243566] [] ? ktime_get+0x3d/0xa0 [ 171.243570] [] ? clockevents_program_event+0x7e/0xe0 [ 171.243574] [] ? sched_clock_cpu+0x15/0x190 [ 171.243577] [] run_timer_softirq+0x1d/0x40 [ 171.243581] [] __do_softirq+0x152/0x2f0 [ 171.243585] [] irq_exit+0x9f/0xb0 [ 171.243588] [] smp_apic_timer_interrupt+0xfd/0x1a0 [ 171.243591] [] apic_timer_interrupt+0x86/0x90 Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0d7a173f8e61c7..6862c2ef244245 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1148,6 +1148,7 @@ static void bond_setup_by_slave(struct net_device *bond_dev, bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; + bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len; memcpy(bond_dev->broadcast, slave_dev->broadcast, From f3b35c3782ed76cdd13b2f6af700dcd5d73ac186 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Sep 2020 21:33:43 -0700 Subject: [PATCH 60/86] mdio: fix mdio-thunder.c dependency & build error [ Upstream commit 7dbbcf496f2a4b6d82cfc7810a0746e160b79762 ] Fix build error by selecting MDIO_DEVRES for MDIO_THUNDER. Fixes this build error: ld: drivers/net/phy/mdio-thunder.o: in function `thunder_mdiobus_pci_probe': drivers/net/phy/mdio-thunder.c:78: undefined reference to `devm_mdiobus_alloc_size' Fixes: 379d7ac7ca31 ("phy: mdio-thunder: Add driver for Cavium Thunder SoC MDIO buses.") Reported-by: kernel test robot Signed-off-by: Randy Dunlap Cc: Bartosz Golaszewski Cc: Andrew Lunn Cc: Heiner Kallweit Cc: netdev@vger.kernel.org Cc: David Daney Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/phy/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index fe602648b99f50..dcf2051ef2c04f 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -193,6 +193,7 @@ config MDIO_THUNDER depends on 64BIT depends on PCI select MDIO_CAVIUM + select MDIO_DEVRES help This driver supports the MDIO interfaces found on Cavium ThunderX SoCs when the MDIO bus device appears as a PCI From 7a96cbd74fcd0ac34680d5ed842e5629bc7aa75b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 27 Sep 2020 09:42:11 +0300 Subject: [PATCH 61/86] mlxsw: spectrum_acl: Fix mlxsw_sp_acl_tcam_group_add()'s error path [ Upstream commit 72865028582a678be1e05240e55d452e5c258eca ] If mlxsw_sp_acl_tcam_group_id_get() fails, the mutex initialized earlier is not destroyed. Fix this by initializing the mutex after calling the function. This is symmetric to mlxsw_sp_acl_tcam_group_del(). Fixes: 5ec2ee28d27b ("mlxsw: spectrum_acl: Introduce a mutex to guard region list updates") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 295b27112d3675..ec0d5a4a60a989 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -290,13 +290,14 @@ mlxsw_sp_acl_tcam_group_add(struct mlxsw_sp_acl_tcam *tcam, int err; group->tcam = tcam; - mutex_init(&group->lock); INIT_LIST_HEAD(&group->region_list); err = mlxsw_sp_acl_tcam_group_id_get(tcam, &group->id); if (err) return err; + mutex_init(&group->lock); + return 0; } From 70877d04d41f2ff97818ec5d77858cf0ab8f8e89 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 27 Sep 2020 19:44:29 +0200 Subject: [PATCH 62/86] r8169: fix RTL8168f/RTL8411 EPHY config [ Upstream commit 709a16be0593c08190982cfbdca6df95e6d5823b ] Mistakenly bit 2 was set instead of bit 3 as in the vendor driver. Fixes: a7a92cf81589 ("r8169: sync PCIe PHY init with vendor driver 8.047.01") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 903212ad9bb2f7..66c97049f52b74 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4701,7 +4701,7 @@ static void rtl_hw_start_8168f_1(struct rtl8169_private *tp) { 0x08, 0x0001, 0x0002 }, { 0x09, 0x0000, 0x0080 }, { 0x19, 0x0000, 0x0224 }, - { 0x00, 0x0000, 0x0004 }, + { 0x00, 0x0000, 0x0008 }, { 0x0c, 0x3df0, 0x0200 }, }; @@ -4718,7 +4718,7 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp) { 0x06, 0x00c0, 0x0020 }, { 0x0f, 0xffff, 0x5200 }, { 0x19, 0x0000, 0x0224 }, - { 0x00, 0x0000, 0x0004 }, + { 0x00, 0x0000, 0x0008 }, { 0x0c, 0x3df0, 0x0200 }, }; From 15f84bdf6185024e48de54a2fc136a71632412e8 Mon Sep 17 00:00:00 2001 From: Wilken Gottwalt Date: Mon, 28 Sep 2020 11:01:04 +0200 Subject: [PATCH 63/86] net: usb: ax88179_178a: fix missing stop entry in driver_info [ Upstream commit 9666ea66a74adfe295cb3a8760c76e1ef70f9caf ] Adds the missing .stop entry in the Belkin driver_info structure. Fixes: e20bd60bf62a ("net: usb: asix88179_178a: Add support for the Belkin B2B128") Signed-off-by: Wilken Gottwalt Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/ax88179_178a.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index df2f7cc6dc03a6..8e37e1f58c4b9e 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1719,6 +1719,7 @@ static const struct driver_info belkin_info = { .status = ax88179_status, .link_reset = ax88179_link_reset, .reset = ax88179_reset, + .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, .tx_fixup = ax88179_tx_fixup, From 0955c774f32d15d39d29548273cccd39bbc9ec23 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 29 Sep 2020 09:58:06 +0800 Subject: [PATCH 64/86] virtio-net: don't disable guest csum when disable LRO [ Upstream commit 1a03b8a35a957f9f38ecb8a97443b7380bbf6a8b ] Open vSwitch and Linux bridge will disable LRO of the interface when this interface added to them. Now when disable the LRO, the virtio-net csum is disable too. That drops the forwarding performance. Fixes: a02e8964eaf9 ("virtio-net: ethtool configurable LRO") Cc: Michael S. Tsirkin Cc: Jason Wang Cc: Willem de Bruijn Signed-off-by: Tonghao Zhang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/virtio_net.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 030d30603c295f..99e1a7bc068863 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -63,6 +63,11 @@ static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_CSUM }; +#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ + (1ULL << VIRTIO_NET_F_GUEST_UFO)) + struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; @@ -2572,7 +2577,8 @@ static int virtnet_set_features(struct net_device *dev, if (features & NETIF_F_LRO) offloads = vi->guest_offloads_capable; else - offloads = 0; + offloads = vi->guest_offloads_capable & + ~GUEST_OFFLOAD_LRO_MASK; err = virtnet_set_guest_offloads(vi, offloads); if (err) From 073fff8102062cd675170ceb54d90da22fe7e668 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 4 Aug 2020 10:40:21 +0300 Subject: [PATCH 65/86] net/mlx5: Avoid possible free of command entry while timeout comp handler [ Upstream commit 50b2412b7e7862c5af0cbf4b10d93bc5c712d021 ] Upon command completion timeout, driver simulates a forced command completion. In a rare case where real interrupt for that command arrives simultaneously, it might release the command entry while the forced handler might still access it. Fix that by adding an entry refcount, to track current amount of allowed handlers. Command entry to be released only when this refcount is decremented to zero. Command refcount is always initialized to one. For callback commands, command completion handler is the symmetric flow to decrement it. For non-callback commands, it is wait_func(). Before ringing the doorbell, increment the refcount for the real completion handler. Once the real completion handler is called, it will decrement it. For callback commands, once the delayed work is scheduled, increment the refcount. Upon callback command completion handler, we will try to cancel the timeout callback. In case of success, we need to decrement the callback refcount as it will never run. In addition, gather the entry index free and the entry free into a one flow for all command types release. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 109 ++++++++++++------ include/linux/mlx5/driver.h | 2 + 2 files changed, 73 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index b6a3370068f1ca..7089ffcc4e5123 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -69,12 +69,10 @@ enum { MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, }; -static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, - struct mlx5_cmd_msg *in, - struct mlx5_cmd_msg *out, - void *uout, int uout_size, - mlx5_cmd_cbk_t cbk, - void *context, int page_queue) +static struct mlx5_cmd_work_ent * +cmd_alloc_ent(struct mlx5_cmd *cmd, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t cbk, void *context, int page_queue) { gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; struct mlx5_cmd_work_ent *ent; @@ -83,6 +81,7 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, if (!ent) return ERR_PTR(-ENOMEM); + ent->idx = -EINVAL; ent->in = in; ent->out = out; ent->uout = uout; @@ -91,10 +90,16 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, ent->context = context; ent->cmd = cmd; ent->page_queue = page_queue; + refcount_set(&ent->refcnt, 1); return ent; } +static void cmd_free_ent(struct mlx5_cmd_work_ent *ent) +{ + kfree(ent); +} + static u8 alloc_token(struct mlx5_cmd *cmd) { u8 token; @@ -109,7 +114,7 @@ static u8 alloc_token(struct mlx5_cmd *cmd) return token; } -static int alloc_ent(struct mlx5_cmd *cmd) +static int cmd_alloc_index(struct mlx5_cmd *cmd) { unsigned long flags; int ret; @@ -123,7 +128,7 @@ static int alloc_ent(struct mlx5_cmd *cmd) return ret < cmd->max_reg_cmds ? ret : -ENOMEM; } -static void free_ent(struct mlx5_cmd *cmd, int idx) +static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { unsigned long flags; @@ -132,6 +137,22 @@ static void free_ent(struct mlx5_cmd *cmd, int idx) spin_unlock_irqrestore(&cmd->alloc_lock, flags); } +static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) +{ + refcount_inc(&ent->refcnt); +} + +static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) +{ + if (!refcount_dec_and_test(&ent->refcnt)) + return; + + if (ent->idx >= 0) + cmd_free_index(ent->cmd, ent->idx); + + cmd_free_ent(ent); +} + static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) { return cmd->cmd_buf + (idx << cmd->log_stride); @@ -219,11 +240,6 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent) ent->ret = -ETIMEDOUT; } -static void free_cmd(struct mlx5_cmd_work_ent *ent) -{ - kfree(ent); -} - static int verify_signature(struct mlx5_cmd_work_ent *ent) { struct mlx5_cmd_mailbox *next = ent->out->next; @@ -842,6 +858,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + cmd_ent_put(ent); /* for the cmd_ent_get() took on schedule delayed work */ } static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); @@ -865,14 +882,14 @@ static void cmd_work_handler(struct work_struct *work) sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { - alloc_ret = alloc_ent(cmd); + alloc_ret = cmd_alloc_index(cmd); if (alloc_ret < 0) { mlx5_core_err(dev, "failed to allocate command entry\n"); if (ent->callback) { ent->callback(-EAGAIN, ent->context); mlx5_free_cmd_msg(dev, ent->out); free_msg(dev, ent->in); - free_cmd(ent); + cmd_ent_put(ent); } else { ent->ret = -EAGAIN; complete(&ent->done); @@ -908,8 +925,8 @@ static void cmd_work_handler(struct work_struct *work) ent->ts1 = ktime_get_ns(); cmd_mode = cmd->mode; - if (ent->callback) - schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) + cmd_ent_get(ent); set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); /* Skip sending command to fw if internal error */ @@ -923,13 +940,10 @@ static void cmd_work_handler(struct work_struct *work) MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); - /* no doorbell, no need to keep the entry */ - free_ent(cmd, ent->idx); - if (ent->callback) - free_cmd(ent); return; } + cmd_ent_get(ent); /* for the _real_ FW event on completion */ /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -1029,11 +1043,16 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, if (callback && page_queue) return -EINVAL; - ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, - page_queue); + ent = cmd_alloc_ent(cmd, in, out, uout, uout_size, + callback, context, page_queue); if (IS_ERR(ent)) return PTR_ERR(ent); + /* put for this ent is when consumed, depending on the use case + * 1) (!callback) blocking flow: by caller after wait_func completes + * 2) (callback) flow: by mlx5_cmd_comp_handler() when ent is handled + */ + ent->token = token; ent->polling = force_polling; @@ -1052,12 +1071,10 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, } if (callback) - goto out; + goto out; /* mlx5_cmd_comp_handler() will put(ent) */ err = wait_func(dev, ent); - if (err == -ETIMEDOUT) - goto out; - if (err == -ECANCELED) + if (err == -ETIMEDOUT || err == -ECANCELED) goto out_free; ds = ent->ts2 - ent->ts1; @@ -1075,7 +1092,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, *status = ent->status; out_free: - free_cmd(ent); + cmd_ent_put(ent); out: return err; } @@ -1490,14 +1507,19 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force if (!forced) { mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", ent->idx); - free_ent(cmd, ent->idx); - free_cmd(ent); + cmd_ent_put(ent); } continue; } - if (ent->callback) - cancel_delayed_work(&ent->cb_timeout_work); + if (ent->callback && cancel_delayed_work(&ent->cb_timeout_work)) + cmd_ent_put(ent); /* timeout work was canceled */ + + if (!forced || /* Real FW completion */ + pci_channel_offline(dev->pdev) || /* FW is inaccessible */ + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + cmd_ent_put(ent); + if (ent->page_queue) sem = &cmd->pages_sem; else @@ -1519,10 +1541,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force ent->ret, deliv_status_to_str(ent->status), ent->status); } - /* only real completion will free the entry slot */ - if (!forced) - free_ent(cmd, ent->idx); - if (ent->callback) { ds = ent->ts2 - ent->ts1; if (ent->op < ARRAY_SIZE(cmd->stats)) { @@ -1550,10 +1568,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force free_msg(dev, ent->in); err = err ? err : ent->status; - if (!forced) - free_cmd(ent); + /* final consumer is done, release ent */ + cmd_ent_put(ent); callback(err, context); } else { + /* release wait_func() so mlx5_cmd_invoke() + * can make the final ent_put() + */ complete(&ent->done); } up(sem); @@ -1563,8 +1584,11 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) { + struct mlx5_cmd *cmd = &dev->cmd; + unsigned long bitmask; unsigned long flags; u64 vector; + int i; /* wait for pending handlers to complete */ mlx5_eq_synchronize_cmd_irq(dev); @@ -1573,11 +1597,20 @@ void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) if (!vector) goto no_trig; + bitmask = vector; + /* we must increment the allocated entries refcount before triggering the completions + * to guarantee pending commands will not get freed in the meanwhile. + * For that reason, it also has to be done inside the alloc_lock. + */ + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_get(cmd->ent_arr[i]); vector |= MLX5_TRIGGERED_CMD_COMP; spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); mlx5_cmd_comp_handler(dev, vector, true); + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_put(cmd->ent_arr[i]); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 897829651204bd..6b4f86dfca3827 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -769,6 +769,8 @@ struct mlx5_cmd_work_ent { u64 ts2; u16 op; bool polling; + /* Track the max comp handlers */ + refcount_t refcnt; }; struct mlx5_pas { From 1e7a94724b78ad81dbe0f0fc9552adfa8ce927b0 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 31 Aug 2020 21:37:31 +0300 Subject: [PATCH 66/86] net/mlx5: Fix request_irqs error flow [ Upstream commit 732ebfab7fe96b7ac9a3df3208f14752a4bb6db3 ] Fix error flow handling in request_irqs which try to free irq that we failed to request. It fixes the below trace. WARNING: CPU: 1 PID: 7587 at kernel/irq/manage.c:1684 free_irq+0x4d/0x60 CPU: 1 PID: 7587 Comm: bash Tainted: G W OE 4.15.15-1.el7MELLANOXsmp-x86_64 #1 Hardware name: Advantech SKY-6200/SKY-6200, BIOS F2.00 08/06/2020 RIP: 0010:free_irq+0x4d/0x60 RSP: 0018:ffffc9000ef47af0 EFLAGS: 00010282 RAX: ffff88001476ae00 RBX: 0000000000000655 RCX: 0000000000000000 RDX: ffff88001476ae00 RSI: ffffc9000ef47ab8 RDI: ffff8800398bb478 RBP: ffff88001476a838 R08: ffff88001476ae00 R09: 000000000000156d R10: 0000000000000000 R11: 0000000000000004 R12: ffff88001476a838 R13: 0000000000000006 R14: ffff88001476a888 R15: 00000000ffffffe4 FS: 00007efeadd32740(0000) GS:ffff88047fc40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc9cc010008 CR3: 00000001a2380004 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: mlx5_irq_table_create+0x38d/0x400 [mlx5_core] ? atomic_notifier_chain_register+0x50/0x60 mlx5_load_one+0x7ee/0x1130 [mlx5_core] init_one+0x4c9/0x650 [mlx5_core] pci_device_probe+0xb8/0x120 driver_probe_device+0x2a1/0x470 ? driver_allows_async_probing+0x30/0x30 bus_for_each_drv+0x54/0x80 __device_attach+0xa3/0x100 pci_bus_add_device+0x4a/0x90 pci_iov_add_virtfn+0x2dc/0x2f0 pci_enable_sriov+0x32e/0x420 mlx5_core_sriov_configure+0x61/0x1b0 [mlx5_core] ? kstrtoll+0x22/0x70 num_vf_store+0x4b/0x70 [mlx5_core] kernfs_fop_write+0x102/0x180 __vfs_write+0x26/0x140 ? rcu_all_qs+0x5/0x80 ? _cond_resched+0x15/0x30 ? __sb_start_write+0x41/0x80 vfs_write+0xad/0x1a0 SyS_write+0x42/0x90 do_syscall_64+0x60/0x110 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 24163189da48 ("net/mlx5: Separate IRQ request/free from EQ life cycle") Signed-off-by: Maor Gottlieb Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 373981a659c7c7..6fd9749203944c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -115,7 +115,7 @@ static int request_irqs(struct mlx5_core_dev *dev, int nvec) return 0; err_request_irq: - for (; i >= 0; i--) { + while (i--) { struct mlx5_irq *irq = mlx5_irq_get(dev, i); int irqn = pci_irq_vector(dev->pdev, i); From 47e83c69fe14c03e0d8ca96d33678b010a88b297 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 20 Jul 2020 16:53:18 +0300 Subject: [PATCH 67/86] net/mlx5e: Add resiliency in Striding RQ mode for packets larger than MTU [ Upstream commit c3c9402373fe20e2d08c04f437ce4dcd252cffb2 ] Prior to this fix, in Striding RQ mode the driver was vulnerable when receiving packets in the range (stride size - headroom, stride size]. Where stride size is calculated by mtu+headroom+tailroom aligned to the closest power of 2. Usually, this filtering is performed by the HW, except for a few cases: - Between 2 VFs over the same PF with different MTUs - On bluefield, when the host physical function sets a larger MTU than the ARM has configured on its representor and uplink representor. When the HW filtering is not present, packets that are larger than MTU might be harmful for the RQ's integrity, in the following impacts: 1) Overflow from one WQE to the next, causing a memory corruption that in most cases is unharmful: as the write happens to the headroom of next packet, which will be overwritten by build_skb(). In very rare cases, high stress/load, this is harmful. When the next WQE is not yet reposted and points to existing SKB head. 2) Each oversize packet overflows to the headroom of the next WQE. On the last WQE of the WQ, where addresses wrap-around, the address of the remainder headroom does not belong to the next WQE, but it is out of the memory region range. This results in a HW CQE error that moves the RQ into an error state. Solution: Add a page buffer at the end of each WQE to absorb the leak. Actually the maximal overflow size is headroom but since all memory units must be of the same size, we use page size to comply with UMR WQEs. The increase in memory consumption is of a single page per RQ. Initialize the mkey with all MTTs pointing to a default page. When the channels are activated, UMR WQEs will redirect the RX WQEs to the actual memory from the RQ's pool, while the overflow MTTs remain mapped to the default page. Fixes: 73281b78a37a ("net/mlx5e: Derive Striding RQ size from MTU") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 8 ++- .../net/ethernet/mellanox/mlx5/core/en_main.c | 55 +++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 98304c42e49528..b5c8afe8cd10d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -92,7 +92,12 @@ struct page_pool; #define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2) -#define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8)) +/* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between + * WQEs, This page will absorb write overflow by the hardware, when + * receiving packets larger than MTU. These oversize packets are + * dropped by the driver at a later stage. + */ +#define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8)) #define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS)) #define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) #define MLX5E_MAX_RQ_NUM_MTTS \ @@ -694,6 +699,7 @@ struct mlx5e_rq { u32 rqn; struct mlx5_core_dev *mdev; struct mlx5_core_mkey umr_mkey; + struct mlx5e_dma_info wqe_overflow; /* XDP read-mostly */ struct xdp_rxq_info xdp_rxq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ee0d78f801af5c..fc710648d2ef37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -266,12 +266,17 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, u64 npages, u8 page_shift, - struct mlx5_core_mkey *umr_mkey) + struct mlx5_core_mkey *umr_mkey, + dma_addr_t filler_addr) { - int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_mtt *mtt; + int inlen; void *mkc; u32 *in; int err; + int i; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + sizeof(*mtt) * npages; in = kvzalloc(inlen, GFP_KERNEL); if (!in) @@ -291,6 +296,18 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, MLX5_SET(mkc, mkc, translations_octword_size, MLX5_MTT_OCTW(npages)); MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + MLX5_MTT_OCTW(npages)); + + /* Initialize the mkey with all MTTs pointing to a default + * page (filler_addr). When the channels are activated, UMR + * WQEs will redirect the RX WQEs to the actual memory from + * the RQ's pool, while the gaps (wqe_overflow) remain mapped + * to the default page. + */ + mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + for (i = 0 ; i < npages ; i++) + mtt[i].ptag = cpu_to_be64(filler_addr); err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen); @@ -302,7 +319,8 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq { u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq)); - return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey); + return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey, + rq->wqe_overflow.addr); } static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix) @@ -370,6 +388,28 @@ static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work) mlx5e_reporter_rq_cqe_err(rq); } +static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) +{ + rq->wqe_overflow.page = alloc_page(GFP_KERNEL); + if (!rq->wqe_overflow.page) + return -ENOMEM; + + rq->wqe_overflow.addr = dma_map_page(rq->pdev, rq->wqe_overflow.page, 0, + PAGE_SIZE, rq->buff.map_dir); + if (dma_mapping_error(rq->pdev, rq->wqe_overflow.addr)) { + __free_page(rq->wqe_overflow.page); + return -ENOMEM; + } + return 0; +} + +static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) +{ + dma_unmap_page(rq->pdev, rq->wqe_overflow.addr, PAGE_SIZE, + rq->buff.map_dir); + __free_page(rq->wqe_overflow.page); +} + static int mlx5e_alloc_rq(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, @@ -434,6 +474,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, if (err) goto err_rq_wq_destroy; + err = mlx5e_alloc_mpwqe_rq_drop_page(rq); + if (err) + goto err_rq_wq_destroy; + rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR]; wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); @@ -474,7 +518,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, err = mlx5e_create_rq_umr_mkey(mdev, rq); if (err) - goto err_rq_wq_destroy; + goto err_rq_drop_page; rq->mkey_be = cpu_to_be32(rq->umr_mkey.key); err = mlx5e_rq_alloc_mpwqe_info(rq, c); @@ -622,6 +666,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: kvfree(rq->mpwqe.info); mlx5_core_destroy_mkey(mdev, &rq->umr_mkey); +err_rq_drop_page: + mlx5e_free_mpwqe_rq_drop_page(rq); break; default: /* MLX5_WQ_TYPE_CYCLIC */ kvfree(rq->wqe.frags); @@ -649,6 +695,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: kvfree(rq->mpwqe.info); mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey); + mlx5e_free_mpwqe_rq_drop_page(rq); break; default: /* MLX5_WQ_TYPE_CYCLIC */ kvfree(rq->wqe.frags); From 6b9752d85e72f8681e3df839b866bd3f1a7c70dc Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 13 Sep 2020 17:57:23 +0300 Subject: [PATCH 68/86] net/mlx5e: Fix VLAN cleanup flow [ Upstream commit 8c7353b6f716436ad0bfda2b5c5524ab2dde5894 ] Prior to this patch unloading an interface in promiscuous mode with RX VLAN filtering feature turned off - resulted in a warning. This is due to a wrong condition in the VLAN rules cleanup flow, which left the any-vid rules in the VLAN steering table. These rules prevented destroying the flow group and the flow table. The any-vid rules are removed in 2 flows, but none of them remove it in case both promiscuous is set and VLAN filtering is off. Fix the issue by changing the condition of the VLAN table cleanup flow to clean also in case of promiscuous mode. mlx5_core 0000:00:08.0: mlx5_destroy_flow_group:2123:(pid 28729): Flow group 20 wasn't destroyed, refcount > 1 mlx5_core 0000:00:08.0: mlx5_destroy_flow_group:2123:(pid 28729): Flow group 19 wasn't destroyed, refcount > 1 mlx5_core 0000:00:08.0: mlx5_destroy_flow_table:2112:(pid 28729): Flow table 262149 wasn't destroyed, refcount > 1 ... ... ------------[ cut here ]------------ FW pages counter is 11560 after reclaiming all pages WARNING: CPU: 1 PID: 28729 at drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:660 mlx5_reclaim_startup_pages+0x178/0x230 [mlx5_core] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: mlx5_function_teardown+0x2f/0x90 [mlx5_core] mlx5_unload_one+0x71/0x110 [mlx5_core] remove_one+0x44/0x80 [mlx5_core] pci_device_remove+0x3e/0xc0 device_release_driver_internal+0xfb/0x1c0 device_release_driver+0x12/0x20 pci_stop_bus_device+0x68/0x90 pci_stop_and_remove_bus_device+0x12/0x20 hv_eject_device_work+0x6f/0x170 [pci_hyperv] ? __schedule+0x349/0x790 process_one_work+0x206/0x400 worker_thread+0x34/0x3f0 ? process_one_work+0x400/0x400 kthread+0x126/0x140 ? kthread_park+0x90/0x90 ret_from_fork+0x22/0x30 ---[ end trace 6283bde8d26170dc ]--- Fixes: 9df30601c843 ("net/mlx5e: Restore vlan filter after seamless reset") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 73d3dc07331f19..c5be0cdfaf0fac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -415,8 +415,12 @@ static void mlx5e_del_vlan_rules(struct mlx5e_priv *priv) for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); - if (priv->fs.vlan.cvlan_filter_disabled && - !(priv->netdev->flags & IFF_PROMISC)) + WARN_ON_ONCE(!(test_bit(MLX5E_STATE_DESTROYING, &priv->state))); + + /* must be called after DESTROY bit is set and + * set_rx_mode is called and flushed + */ + if (priv->fs.vlan.cvlan_filter_disabled) mlx5e_del_any_vid_rules(priv); } From 9a52da3f61b45fea94478b59e2f3829a64a615a5 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 13 Sep 2020 18:05:40 +0300 Subject: [PATCH 69/86] net/mlx5e: Fix VLAN create flow [ Upstream commit d4a16052bccdd695982f89d815ca075825115821 ] When interface is attached while in promiscuous mode and with VLAN filtering turned off, both configurations are not respected and VLAN filtering is performed. There are 2 flows which add the any-vid rules during interface attach: VLAN creation table and set rx mode. Each is relaying on the other to add any-vid rules, eventually non of them does. Fix this by adding any-vid rules on VLAN creation regardless of promiscuous mode. Fixes: 9df30601c843 ("net/mlx5e: Restore vlan filter after seamless reset") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index c5be0cdfaf0fac..713dc210f710cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -217,6 +217,9 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv, break; } + if (WARN_ONCE(*rule_p, "VLAN rule already exists type %d", rule_type)) + return 0; + *rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); if (IS_ERR(*rule_p)) { @@ -397,8 +400,7 @@ static void mlx5e_add_vlan_rules(struct mlx5e_priv *priv) for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); - if (priv->fs.vlan.cvlan_filter_disabled && - !(priv->netdev->flags & IFF_PROMISC)) + if (priv->fs.vlan.cvlan_filter_disabled) mlx5e_add_any_vid_rules(priv); } From 7e1f39b5c1d5ba70a70844658cbde0d0ab8f7e1d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 4 Sep 2020 14:01:24 -0300 Subject: [PATCH 70/86] rxrpc: Fix rxkad token xdr encoding [ Upstream commit 56305118e05b2db8d0395bba640ac9a3aee92624 ] The session key should be encoded with just the 8 data bytes and no length; ENCODE_DATA precedes it with a 4 byte length, which confuses some existing tools that try to parse this format. Add an ENCODE_BYTES macro that does not include a length, and use it for the key. Also adjust the expected length. Note that commit 774521f353e1d ("rxrpc: Fix an assertion in rxrpc_read()") had fixed a BUG by changing the length rather than fixing the encoding. The original length was correct. Fixes: 99455153d067 ("RxRPC: Parse security index 5 keys (Kerberos 5)") Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Sasha Levin --- net/rxrpc/key.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 0c98313dd7a8cb..d77e89766406aa 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -1073,7 +1073,7 @@ static long rxrpc_read(const struct key *key, switch (token->security_index) { case RXRPC_SECURITY_RXKAD: - toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin, + toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ toksize += RND(token->kad->ticket_len); break; @@ -1139,6 +1139,14 @@ static long rxrpc_read(const struct key *key, memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) +#define ENCODE_BYTES(l, s) \ + do { \ + u32 _l = (l); \ + memcpy(xdr, (s), _l); \ + if (_l & 3) \ + memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ + xdr += (_l + 3) >> 2; \ + } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ @@ -1166,7 +1174,7 @@ static long rxrpc_read(const struct key *key, case RXRPC_SECURITY_RXKAD: ENCODE(token->kad->vice_id); ENCODE(token->kad->kvno); - ENCODE_DATA(8, token->kad->session_key); + ENCODE_BYTES(8, token->kad->session_key); ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); From 422f5c5d3ef9eb44eaac710504593da733aca985 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 8 Sep 2020 22:09:04 +0100 Subject: [PATCH 71/86] rxrpc: Downgrade the BUG() for unsupported token type in rxrpc_read() [ Upstream commit 9a059cd5ca7d9c5c4ca5a6e755cf72f230176b6a ] If rxrpc_read() (which allows KEYCTL_READ to read a key), sees a token of a type it doesn't recognise, it can BUG in a couple of places, which is unnecessary as it can easily get back to userspace. Fix this to print an error message instead. Fixes: 99455153d067 ("RxRPC: Parse security index 5 keys (Kerberos 5)") Signed-off-by: David Howells Signed-off-by: Sasha Levin --- net/rxrpc/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index d77e89766406aa..32f46edcf7c67b 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -1108,7 +1108,8 @@ static long rxrpc_read(const struct key *key, break; default: /* we have a ticket we can't encode */ - BUG(); + pr_err("Unsupported key token type (%u)\n", + token->security_index); continue; } @@ -1224,7 +1225,6 @@ static long rxrpc_read(const struct key *key, break; default: - BUG(); break; } From 513dd1609c9d094b7863f02f05509db3c78adf20 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Oct 2020 11:57:40 +0100 Subject: [PATCH 72/86] rxrpc: Fix some missing _bh annotations on locking conn->state_lock [ Upstream commit fa1d113a0f96f9ab7e4fe4f8825753ba1e34a9d3 ] conn->state_lock may be taken in softirq mode, but a previous patch replaced an outer lock in the response-packet event handling code, and lost the _bh from that when doing so. Fix this by applying the _bh annotation to the state_lock locking. Fixes: a1399f8bb033 ("rxrpc: Call channels should have separate call number spaces") Signed-off-by: David Howells Signed-off-by: Sasha Levin --- net/rxrpc/conn_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 06fcff2ebbba1e..b5864683f200d4 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -341,18 +341,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return ret; spin_lock(&conn->channel_lock); - spin_lock(&conn->state_lock); + spin_lock_bh(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_bh(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->channel_lock))); } else { - spin_unlock(&conn->state_lock); + spin_unlock_bh(&conn->state_lock); } spin_unlock(&conn->channel_lock); From ae1a085b4aacda829f7a469c6d3d9d52e18a5e1d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 30 Sep 2020 19:52:08 +0100 Subject: [PATCH 73/86] rxrpc: The server keyring isn't network-namespaced [ Upstream commit fea99111244bae44e7d82a973744d27ea1567814 ] The keyring containing the server's tokens isn't network-namespaced, so it shouldn't be looked up with a network namespace. It is expected to be owned specifically by the server, so namespacing is unnecessary. Fixes: a58946c158a0 ("keys: Pass the network namespace into request_key mechanism") Signed-off-by: David Howells Signed-off-by: Sasha Levin --- net/rxrpc/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 32f46edcf7c67b..64cbbd2f16944e 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -941,7 +941,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (IS_ERR(description)) return PTR_ERR(description); - key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL); + key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); From b750f86a62d1a3bc75e17f9934eac6b225fd5387 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Oct 2020 14:04:51 +0100 Subject: [PATCH 74/86] rxrpc: Fix server keyring leak [ Upstream commit 38b1dc47a35ba14c3f4472138ea56d014c2d609b ] If someone calls setsockopt() twice to set a server key keyring, the first keyring is leaked. Fix it to return an error instead if the server key keyring is already set. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells Signed-off-by: Sasha Levin --- net/rxrpc/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 64cbbd2f16944e..85a9ff8cd236a0 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -903,7 +903,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) _enter(""); - if (optlen <= 0 || optlen > PAGE_SIZE - 1) + if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->securities) return -EINVAL; description = memdup_user_nul(optval, optlen); From 760c7a948bea9881598c528a4779aaa7f142d91b Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 27 Aug 2020 12:17:32 +0530 Subject: [PATCH 75/86] perf: Fix task_function_call() error handling [ Upstream commit 6d6b8b9f4fceab7266ca03d194f60ec72bd4b654 ] The error handling introduced by commit: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") looses any return value from smp_call_function_single() that is not {0, -EINVAL}. This is a problem because it will return -EXNIO when the target CPU is offline. Worse, in that case it'll turn into an infinite loop. Fixes: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") Reported-by: Srikar Dronamraju Signed-off-by: Kajol Jain Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Barret Rhoden Tested-by: Srikar Dronamraju Link: https://lkml.kernel.org/r/20200827064732.20860-1-kjain@linux.ibm.com Signed-off-by: Sasha Levin --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 47646050efa0cd..09e1cc22221fe4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -97,7 +97,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -113,7 +113,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret; if (ret != -EAGAIN) break; From d94c1505fa919759ad71f4e3aecbe198445fed0e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 2 Oct 2020 09:38:52 +0800 Subject: [PATCH 76/86] mmc: core: don't set limits.discard_granularity as 0 [ Upstream commit 4243219141b67d7c2fdb2d8073c17c539b9263eb ] In mmc_queue_setup_discard() the mmc driver queue's discard_granularity might be set as 0 (when card->pref_erase > max_discard) while the mmc device still declares to support discard operation. This is buggy and triggered the following kernel warning message, WARNING: CPU: 0 PID: 135 at __blkdev_issue_discard+0x200/0x294 CPU: 0 PID: 135 Comm: f2fs_discard-17 Not tainted 5.9.0-rc6 #1 Hardware name: Google Kevin (DT) pstate: 00000005 (nzcv daif -PAN -UAO BTYPE=--) pc : __blkdev_issue_discard+0x200/0x294 lr : __blkdev_issue_discard+0x54/0x294 sp : ffff800011dd3b10 x29: ffff800011dd3b10 x28: 0000000000000000 x27: ffff800011dd3cc4 x26: ffff800011dd3e18 x25: 000000000004e69b x24: 0000000000000c40 x23: ffff0000f1deaaf0 x22: ffff0000f2849200 x21: 00000000002734d8 x20: 0000000000000008 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000394 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 00000000000008b0 x9 : ffff800011dd3cb0 x8 : 000000000004e69b x7 : 0000000000000000 x6 : ffff0000f1926400 x5 : ffff0000f1940800 x4 : 0000000000000000 x3 : 0000000000000c40 x2 : 0000000000000008 x1 : 00000000002734d8 x0 : 0000000000000000 Call trace: __blkdev_issue_discard+0x200/0x294 __submit_discard_cmd+0x128/0x374 __issue_discard_cmd_orderly+0x188/0x244 __issue_discard_cmd+0x2e8/0x33c issue_discard_thread+0xe8/0x2f0 kthread+0x11c/0x120 ret_from_fork+0x10/0x1c ---[ end trace e4c8023d33dfe77a ]--- This patch fixes the issue by setting discard_granularity as SECTOR_SIZE instead of 0 when (card->pref_erase > max_discard) is true. Now no more complain from __blkdev_issue_discard() for the improper value of discard granularity. This issue is exposed after commit b35fd7422c2f ("block: check queue's limits.discard_granularity in __blkdev_issue_discard()"), a "Fixes:" tag is also added for the commit to make sure people won't miss this patch after applying the change of __blkdev_issue_discard(). Fixes: e056a1b5b67b ("mmc: queue: let host controllers specify maximum discard timeout") Fixes: b35fd7422c2f ("block: check queue's limits.discard_granularity in __blkdev_issue_discard()"). Reported-and-tested-by: Vicente Bergas Signed-off-by: Coly Li Acked-by: Adrian Hunter Cc: Ulf Hansson Link: https://lore.kernel.org/r/20201002013852.51968-1-colyli@suse.de Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/core/queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 9c0ccb3744c287..81b8d5ede484ee 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -184,7 +184,7 @@ static void mmc_queue_setup_discard(struct request_queue *q, q->limits.discard_granularity = card->pref_erase << 9; /* granularity must not be greater than max. discard */ if (card->pref_erase > max_discard) - q->limits.discard_granularity = 0; + q->limits.discard_granularity = SECTOR_SIZE; if (mmc_can_secure_erase_trim(card)) blk_queue_flag_set(QUEUE_FLAG_SECERASE, q); } From 2729afe17987d55845fee512e3d713bc4724c553 Mon Sep 17 00:00:00 2001 From: Vijay Balakrishna Date: Sat, 10 Oct 2020 23:16:40 -0700 Subject: [PATCH 77/86] mm: khugepaged: recalculate min_free_kbytes after memory hotplug as expected by khugepaged commit 4aab2be0983031a05cb4a19696c9da5749523426 upstream. When memory is hotplug added or removed the min_free_kbytes should be recalculated based on what is expected by khugepaged. Currently after hotplug, min_free_kbytes will be set to a lower default and higher default set when THP enabled is lost. This change restores min_free_kbytes as expected for THP consumers. [vijayb@linux.microsoft.com: v5] Link: https://lkml.kernel.org/r/1601398153-5517-1-git-send-email-vijayb@linux.microsoft.com Fixes: f000565adb77 ("thp: set recommended min free kbytes") Signed-off-by: Vijay Balakrishna Signed-off-by: Andrew Morton Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Allen Pais Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Song Liu Cc: Link: https://lkml.kernel.org/r/1600305709-2319-2-git-send-email-vijayb@linux.microsoft.com Link: https://lkml.kernel.org/r/1600204258-13683-1-git-send-email-vijayb@linux.microsoft.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/khugepaged.h | 5 +++++ mm/khugepaged.c | 13 +++++++++++-- mm/page_alloc.c | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index bc45ea1efbf797..c941b73773216f 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); #else @@ -85,6 +86,10 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { } + +static inline void khugepaged_min_free_kbytes_update(void) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index be897e45effbfa..f0d7e6483ba326 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -54,6 +54,9 @@ enum scan_result { #define CREATE_TRACE_POINTS #include +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); + /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; @@ -2177,8 +2180,6 @@ static void set_recommended_min_free_kbytes(void) int start_stop_khugepaged(void) { - static struct task_struct *khugepaged_thread __read_mostly; - static DEFINE_MUTEX(khugepaged_mutex); int err = 0; mutex_lock(&khugepaged_mutex); @@ -2205,3 +2206,11 @@ int start_stop_khugepaged(void) mutex_unlock(&khugepaged_mutex); return err; } + +void khugepaged_min_free_kbytes_update(void) +{ + mutex_lock(&khugepaged_mutex); + if (khugepaged_enabled() && khugepaged_thread) + set_recommended_min_free_kbytes(); + mutex_unlock(&khugepaged_mutex); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 373ca578075896..aff0bb4629bdf4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -7870,6 +7871,8 @@ int __meminit init_per_zone_wmark_min(void) setup_min_slab_ratio(); #endif + khugepaged_min_free_kbytes_update(); + return 0; } postcore_initcall(init_per_zone_wmark_min) From e39c9eba9bef0e1dd01d744360660873c10d9d3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Oct 2020 06:48:13 -0700 Subject: [PATCH 78/86] tcp: fix receive window update in tcp_add_backlog() commit 86bccd0367130f481ca99ba91de1c6a5aa1c78c1 upstream. We got reports from GKE customers flows being reset by netfilter conntrack unless nf_conntrack_tcp_be_liberal is set to 1. Traces seemed to suggest ACK packet being dropped by the packet capture, or more likely that ACK were received in the wrong order. wscale=7, SYN and SYNACK not shown here. This ACK allows the sender to send 1871*128 bytes from seq 51359321 : New right edge of the window -> 51359321+1871*128=51598809 09:17:23.389210 IP A > B: Flags [.], ack 51359321, win 1871, options [nop,nop,TS val 10 ecr 999], length 0 09:17:23.389212 IP B > A: Flags [.], seq 51422681:51424089, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 1408 09:17:23.389214 IP A > B: Flags [.], ack 51422681, win 1376, options [nop,nop,TS val 10 ecr 999], length 0 09:17:23.389253 IP B > A: Flags [.], seq 51424089:51488857, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 64768 09:17:23.389272 IP A > B: Flags [.], ack 51488857, win 859, options [nop,nop,TS val 10 ecr 999], length 0 09:17:23.389275 IP B > A: Flags [.], seq 51488857:51521241, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 32384 Receiver now allows to send 606*128=77568 from seq 51521241 : New right edge of the window -> 51521241+606*128=51598809 09:17:23.389296 IP A > B: Flags [.], ack 51521241, win 606, options [nop,nop,TS val 10 ecr 999], length 0 09:17:23.389308 IP B > A: Flags [.], seq 51521241:51553625, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 32384 It seems the sender exceeds RWIN allowance, since 51611353 > 51598809 09:17:23.389346 IP B > A: Flags [.], seq 51553625:51611353, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 57728 09:17:23.389356 IP B > A: Flags [.], seq 51611353:51618393, ack 1577, win 268, options [nop,nop,TS val 999 ecr 10], length 7040 09:17:23.389367 IP A > B: Flags [.], ack 51611353, win 0, options [nop,nop,TS val 10 ecr 999], length 0 netfilter conntrack is not happy and sends RST 09:17:23.389389 IP A > B: Flags [R], seq 92176528, win 0, length 0 09:17:23.389488 IP B > A: Flags [R], seq 174478967, win 0, length 0 Now imagine ACK were delivered out of order and tcp_add_backlog() sets window based on wrong packet. New right edge of the window -> 51521241+859*128=51631193 Normally TCP stack handles OOO packets just fine, but it turns out tcp_add_backlog() does not. It can update the window field of the aggregated packet even if the ACK sequence of the last received packet is too old. Many thanks to Alexandre Ferrieux for independently reporting the issue and suggesting a fix. Fixes: 4f693b55c3d2 ("tcp: implement coalescing on backlog queue") Signed-off-by: Eric Dumazet Reported-by: Alexandre Ferrieux Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35f963690a70ef..87a5037a9cb3ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1719,12 +1719,12 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, hdrlen); if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { - thtail->window = th->window; - TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; - if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) + if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + thtail->window = th->window; + } /* We have to update both TCP_SKB_CB(tail)->tcp_flags and * thtail->fin, so that the fast path in tcp_rcv_established() From a42dbd059ef6616e009d3f9d7e6c92e93d9eb27b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 2 Oct 2020 21:53:08 +0200 Subject: [PATCH 79/86] net/core: check length before updating Ethertype in skb_mpls_{push,pop} commit 4296adc3e32f5d544a95061160fe7e127be1b9ff upstream. Openvswitch allows to drop a packet's Ethernet header, therefore skb_mpls_push() and skb_mpls_pop() might be called with ethernet=true and mac_len=0. In that case the pointer passed to skb_mod_eth_type() doesn't point to an Ethernet header and the new Ethertype is written at unexpected locations. Fix this by verifying that mac_len is big enough to contain an Ethernet header. Fixes: fa4e0f8855fc ("net/sched: fix corrupted L2 header with MPLS 'push' and 'pop' actions") Signed-off-by: Guillaume Nault Acked-by: Davide Caratti Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 08d9915d50c0fb..466d6273da9f2c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5515,7 +5515,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); - if (ethernet) + if (ethernet && mac_len >= ETH_HLEN) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; @@ -5555,7 +5555,7 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); - if (ethernet) { + if (ethernet && mac_len >= ETH_HLEN) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ From 8dc5025c6a444548ab5e2ef9e6f4479f71c7c1a0 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Thu, 24 Sep 2020 12:28:45 +0530 Subject: [PATCH 80/86] net/tls: race causes kernel panic commit 38f7e1c0c43dd25b06513137bb6fd35476f9ec6d upstream. BUG: kernel NULL pointer dereference, address: 00000000000000b8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 80000008b6fef067 P4D 80000008b6fef067 PUD 8b6fe6067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 12 PID: 23871 Comm: kworker/12:80 Kdump: loaded Tainted: G S 5.9.0-rc3+ #1 Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.1 03/29/2018 Workqueue: events tx_work_handler [tls] RIP: 0010:tx_work_handler+0x1b/0x70 [tls] Code: dc fe ff ff e8 16 d4 a3 f6 66 0f 1f 44 00 00 0f 1f 44 00 00 55 53 48 8b 6f 58 48 8b bd a0 04 00 00 48 85 ff 74 1c 48 8b 47 28 <48> 8b 90 b8 00 00 00 83 e2 02 75 0c f0 48 0f ba b0 b8 00 00 00 00 RSP: 0018:ffffa44ace61fe88 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff91da9e45cc30 RCX: dead000000000122 RDX: 0000000000000001 RSI: ffff91da9e45cc38 RDI: ffff91d95efac200 RBP: ffff91da133fd780 R08: 0000000000000000 R09: 000073746e657665 R10: 8080808080808080 R11: 0000000000000000 R12: ffff91dad7d30700 R13: ffff91dab6561080 R14: 0ffff91dad7d3070 R15: ffff91da9e45cc38 FS: 0000000000000000(0000) GS:ffff91dad7d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000b8 CR3: 0000000906478003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: process_one_work+0x1a7/0x370 worker_thread+0x30/0x370 ? process_one_work+0x370/0x370 kthread+0x114/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x22/0x30 tls_sw_release_resources_tx() waits for encrypt_pending, which can have race, so we need similar changes as in commit 0cada33241d9de205522e3858b18e506ca5cce2c here as well. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Signed-off-by: Rohit Maheshwari Acked-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_sw.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 70b203e5d5fd0e..515d295309a861 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2137,10 +2137,15 @@ void tls_sw_release_resources_tx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; + int pending; /* Wait for any pending async encryptions to complete */ - smp_store_mb(ctx->async_notify, true); - if (atomic_read(&ctx->encrypt_pending)) + spin_lock_bh(&ctx->encrypt_compl_lock); + ctx->async_notify = true; + pending = atomic_read(&ctx->encrypt_pending); + spin_unlock_bh(&ctx->encrypt_compl_lock); + + if (pending) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); tls_tx_records(sk, -1); From 536c767b14e394eb8ccead829e7f35428c0f13ef Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 9 Aug 2020 12:34:21 +0300 Subject: [PATCH 81/86] net/mlx5e: Fix driver's declaration to support GRE offload commit 3d093bc2369003b4ce6c3522d9b383e47c40045d upstream. Declare GRE offload support with respect to the inner protocol. Add a list of supported inner protocols on which the driver can offload checksum and GSO. For other protocols, inform the stack to do the needed operations. There is no noticeable impact on GRE performance. Fixes: 2729984149e6 ("net/mlx5e: Support TSO and TX checksum offloads for GRE tunnels") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fc710648d2ef37..8b8581f71e7936 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4328,6 +4328,21 @@ void mlx5e_del_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti) mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0); } +static bool mlx5e_gre_tunnel_inner_proto_offload_supported(struct mlx5_core_dev *mdev, + struct sk_buff *skb) +{ + switch (skb->inner_protocol) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TEB): + return true; + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): + return MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre); + } + return false; +} + static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, struct sk_buff *skb, netdev_features_t features) @@ -4350,7 +4365,9 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, switch (proto) { case IPPROTO_GRE: - return features; + if (mlx5e_gre_tunnel_inner_proto_offload_supported(priv->mdev, skb)) + return features; + break; case IPPROTO_IPIP: case IPPROTO_IPV6: if (mlx5e_tunnel_proto_supported(priv->mdev, IPPROTO_IPIP)) From 6c9edf2d855a3456abce8ee1603a4114337ce3ee Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 21 Jul 2020 22:24:24 -0700 Subject: [PATCH 82/86] Input: ati_remote2 - add missing newlines when printing module parameters commit 37bd9e803daea816f2dc2c8f6dc264097eb3ebd2 upstream. When I cat some module parameters by sysfs, it displays as follows. It's better to add a newline for easy reading. root@syzkaller:~# cat /sys/module/ati_remote2/parameters/mode_mask 0x1froot@syzkaller:~# cat /sys/module/ati_remote2/parameters/channel_mask 0xffffroot@syzkaller:~# Signed-off-by: Xiongfeng Wang Link: https://lore.kernel.org/r/20200720092148.9320-1-wangxiongfeng2@huawei.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/misc/ati_remote2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/misc/ati_remote2.c b/drivers/input/misc/ati_remote2.c index 305f0160506a04..8a36d78fed63ad 100644 --- a/drivers/input/misc/ati_remote2.c +++ b/drivers/input/misc/ati_remote2.c @@ -68,7 +68,7 @@ static int ati_remote2_get_channel_mask(char *buffer, { pr_debug("%s()\n", __func__); - return sprintf(buffer, "0x%04x", *(unsigned int *)kp->arg); + return sprintf(buffer, "0x%04x\n", *(unsigned int *)kp->arg); } static int ati_remote2_set_mode_mask(const char *val, @@ -84,7 +84,7 @@ static int ati_remote2_get_mode_mask(char *buffer, { pr_debug("%s()\n", __func__); - return sprintf(buffer, "0x%02x", *(unsigned int *)kp->arg); + return sprintf(buffer, "0x%02x\n", *(unsigned int *)kp->arg); } static unsigned int channel_mask = ATI_REMOTE2_MAX_CHANNEL_MASK; From dbb763107d3ee8ecfe1fc0e262158339e9875892 Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Mon, 5 Oct 2020 18:59:58 +0530 Subject: [PATCH 83/86] net: usb: rtl8150: set random MAC address when set_ethernet_addr() fails commit f45a4248ea4cc13ed50618ff066849f9587226b2 upstream. When get_registers() fails in set_ethernet_addr(),the uninitialized value of node_id gets copied over as the address. So, check the return value of get_registers(). If get_registers() executed successfully (i.e., it returns sizeof(node_id)), copy over the MAC address using ether_addr_copy() (instead of using memcpy()). Else, if get_registers() failed instead, a randomly generated MAC address is set as the MAC address instead. Reported-by: syzbot+abbc768b560c84d92fd3@syzkaller.appspotmail.com Tested-by: syzbot+abbc768b560c84d92fd3@syzkaller.appspotmail.com Acked-by: Petko Manolov Signed-off-by: Anant Thazhemadam Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/rtl8150.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 13e51ccf021473..491625c1c30849 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -274,12 +274,20 @@ static int write_mii_word(rtl8150_t * dev, u8 phy, __u8 indx, u16 reg) return 1; } -static inline void set_ethernet_addr(rtl8150_t * dev) +static void set_ethernet_addr(rtl8150_t *dev) { - u8 node_id[6]; + u8 node_id[ETH_ALEN]; + int ret; + + ret = get_registers(dev, IDR, sizeof(node_id), node_id); - get_registers(dev, IDR, sizeof(node_id), node_id); - memcpy(dev->netdev->dev_addr, node_id, sizeof(node_id)); + if (ret == sizeof(node_id)) { + ether_addr_copy(dev->netdev->dev_addr, node_id); + } else { + eth_hw_addr_random(dev->netdev); + netdev_notice(dev->netdev, "Assigned a random MAC address: %pM\n", + dev->netdev->dev_addr); + } } static int rtl8150_set_mac_address(struct net_device *netdev, void *p) From a5de4ee6d055899b67bfb41722bbeb5c6e0fba70 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 22 Sep 2020 20:56:23 -0700 Subject: [PATCH 84/86] net_sched: defer tcf_idr_insert() in tcf_action_init_1() commit e49d8c22f1261c43a986a7fdbf677ac309682a07 upstream. All TC actions call tcf_idr_insert() for new action at the end of their ->init(), so we can actually move it to a central place in tcf_action_init_1(). And once the action is inserted into the global IDR, other parallel process could free it immediately as its refcnt is still 1, so we can not fail after this, we need to move it after the goto action validation to avoid handling the failure case after insertion. This is found during code review, is not directly triggered by syzbot. And this prepares for the next patch. Cc: Vlad Buslov Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/act_api.h | 2 -- net/sched/act_api.c | 38 ++++++++++++++++++++------------------ net/sched/act_bpf.c | 4 +--- net/sched/act_connmark.c | 1 - net/sched/act_csum.c | 3 --- net/sched/act_ct.c | 2 -- net/sched/act_ctinfo.c | 3 --- net/sched/act_gact.c | 2 -- net/sched/act_ife.c | 3 --- net/sched/act_ipt.c | 2 -- net/sched/act_mirred.c | 2 -- net/sched/act_mpls.c | 2 -- net/sched/act_nat.c | 3 --- net/sched/act_pedit.c | 2 -- net/sched/act_police.c | 2 -- net/sched/act_sample.c | 2 -- net/sched/act_simple.c | 2 -- net/sched/act_skbedit.c | 2 -- net/sched/act_skbmod.c | 2 -- net/sched/act_tunnel_key.c | 3 --- net/sched/act_vlan.c | 2 -- 21 files changed, 21 insertions(+), 63 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index 59d05feecfb8a9..05b568b92e59d6 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -156,8 +156,6 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats); -void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); - void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 69d4676a402f54..fea74060d2c3a6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -451,17 +451,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, } EXPORT_SYMBOL(tcf_idr_create); -void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) -{ - struct tcf_idrinfo *idrinfo = tn->idrinfo; - - mutex_lock(&idrinfo->lock); - /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ - WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); - mutex_unlock(&idrinfo->lock); -} -EXPORT_SYMBOL(tcf_idr_insert); - /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) @@ -839,6 +828,16 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, }; +static void tcf_idr_insert(struct tc_action *a) +{ + struct tcf_idrinfo *idrinfo = a->idrinfo; + + mutex_lock(&idrinfo->lock); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); + mutex_unlock(&idrinfo->lock); +} + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -921,6 +920,16 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) goto err_mod; + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && + !rcu_access_pointer(a->goto_chain)) { + tcf_action_destroy_1(a, bind); + NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); + return ERR_PTR(-EINVAL); + } + + if (err == ACT_P_CREATED) + tcf_idr_insert(a); + if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@ -931,13 +940,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); - if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && - !rcu_access_pointer(a->goto_chain)) { - tcf_action_destroy_1(a, bind); - NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); - return ERR_PTR(-EINVAL); - } - return a; err_mod: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 04b7bd4ec7513c..b7d83d01841b14 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -361,9 +361,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (res == ACT_P_CREATED) { - tcf_idr_insert(tn, *act); - } else { + if (res != ACT_P_CREATED) { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 1a8f2f85ea1ab4..b00105e623341a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -139,7 +139,6 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 428b1ae00123d2..fa1b1fd10c4411 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -110,9 +110,6 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (params_new) kfree_rcu(params_new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index e32c4732ddf831..6119c31dcd0725 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -740,8 +740,6 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, tcf_chain_put_by_act(goto_ch); if (params) call_rcu(¶ms->rcu, tcf_ct_params_free); - if (res == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return res; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index a91fcee810efa7..9722204399a421 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -269,9 +269,6 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, if (cp_new) kfree_rcu(cp_new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 324f1d1f6d4779..faf68a44b8451d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -139,8 +139,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; release_idr: tcf_idr_release(*a, bind); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 778371bac93e24..488d10476e850d 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -626,9 +626,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; metadata_parse_err: if (goto_ch) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 214a03d405cf9b..02b0cb67643e9d 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -189,8 +189,6 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, ipt->tcfi_t = t; ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; err3: diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 27f624971121be..8327ef9793ef85 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -194,8 +194,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); - - tcf_idr_insert(tn, *a); } return ret; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index f786775699b518..f496c99d982664 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -273,8 +273,6 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index ea4c5359e7dfb5..8b5d2360a4dc27 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -93,9 +93,6 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; release_idr: tcf_idr_release(*a, bind); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b5bc631b96b7e1..ff4f2437b59258 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -237,8 +237,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 89c04c52af3dab..8fd23a8b88a5e7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -201,8 +201,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (new) kfree_rcu(new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; failure: diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 514456a0b9a832..74450b0f69fc52 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -116,8 +116,6 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6120e56117ca14..6b0617de71c00b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -156,8 +156,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, goto release_idr; } - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index f98b2791ecec44..f736da513d53a9 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -214,8 +214,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 888437f97ba620..e858a0a9c04575 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -190,8 +190,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d55669e1474171..bdaa04a9a7fa42 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -392,9 +392,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 08aaf719a70fbb..3c26042f4ea6d7 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -228,8 +228,6 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) From 22e6625babfc1fe349032cd46237085b91aae076 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 22 Sep 2020 20:56:24 -0700 Subject: [PATCH 85/86] net_sched: commit action insertions together commit 0fedc63fadf0404a729e73a35349481c8009c02f upstream. syzbot is able to trigger a failure case inside the loop in tcf_action_init(), and when this happens we clean up with tcf_action_destroy(). But, as these actions are already inserted into the global IDR, other parallel process could free them before tcf_action_destroy(), then we will trigger a use-after-free. Fix this by deferring the insertions even later, after the loop, and committing all the insertions in a separate loop, so we will never fail in the middle of the insertions any more. One side effect is that the window between alloction and final insertion becomes larger, now it is more likely that the loop in tcf_del_walker() sees the placeholder -EBUSY pointer. So we have to check for error pointer in tcf_del_walker(). Reported-and-tested-by: syzbot+2287853d392e4b42374a@syzkaller.appspotmail.com Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Cc: Vlad Buslov Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_api.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fea74060d2c3a6..4a5ef2adb2e575 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -303,6 +303,8 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { + if (IS_ERR(p)) + continue; ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); @@ -828,14 +830,24 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, }; -static void tcf_idr_insert(struct tc_action *a) +static void tcf_idr_insert_many(struct tc_action *actions[]) { - struct tcf_idrinfo *idrinfo = a->idrinfo; + int i; - mutex_lock(&idrinfo->lock); - /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ - WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); - mutex_unlock(&idrinfo->lock); + for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { + struct tc_action *a = actions[i]; + struct tcf_idrinfo *idrinfo; + + if (!a) + continue; + idrinfo = a->idrinfo; + mutex_lock(&idrinfo->lock); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc if + * it is just created, otherwise this is just a nop. + */ + idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + mutex_unlock(&idrinfo->lock); + } } struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -927,9 +939,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, return ERR_PTR(-EINVAL); } - if (err == ACT_P_CREATED) - tcf_idr_insert(a); - if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@ -983,6 +992,11 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, actions[i - 1] = act; } + /* We have to commit them all together, because if any error happened in + * between, we could not handle the failure gracefully. + */ + tcf_idr_insert_many(actions); + *attr_size = tcf_action_full_attrs_size(sz); return i - 1; From 85b0841aab15c12948af951d477183ab3df7de14 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 14 Oct 2020 10:33:07 +0200 Subject: [PATCH 86/86] Linux 5.4.71 Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Shuah Khan Link: https://lore.kernel.org/r/20201012132632.846779148@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e409fd909560f7..f342e64c8c1d19 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 70 +SUBLEVEL = 71 EXTRAVERSION = NAME = Kleptomaniac Octopus