[PATCH V3 1/8] fs/ext4: Narrow scope of DAX check in setflags
From: Ira Weiny When preventing DAX and journaling on an inode, use the effective DAX check rather than the mount option. This will be required to support per-inode DAX flags. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- fs/ext4/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..5813e5e73eab 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -393,9 +393,9 @@ static int ext4_ioctl_setflags(struct inode *inode, if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to -* S_DAX if we are using the DAX mount option. +* S_DAX if the inode is DAX */ - if (test_opt(inode->i_sb, DAX)) { + if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } -- 2.25.1
[PATCH V3 5/8] fs/ext4: Only change S_DAX on inode load
From: Ira Weiny To prevent complications with in memory inodes we only set S_DAX on inode load. FS_XFLAG_DAX can be changed at any time and S_DAX will change after inode eviction and reload. Add init bool to ext4_set_inode_flags() to indicate if the inode is being newly initialized. Assert that S_DAX is not set on an inode which is just being loaded. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes from V2: Rework based on moving the encryption patch to the end. Changes from RFC: Change J_ASSERT() to WARN_ON_ONCE() Fix bug which would clear S_DAX incorrectly --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 13 ++--- fs/ext4/ioctl.c | 3 ++- fs/ext4/super.c | 4 ++-- fs/ext4/verity.c | 2 +- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1a3daf2d18ef..86a0994332ce 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2692,7 +2692,7 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern void ext4_set_inode_flags(struct inode *); +extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..7941c140723f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1116,7 +1116,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei->i_block_group = group; ei->i_last_alloc_group = ~0; - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3a4c2ed7a1c..23e42a223235 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4419,11 
+4419,13 @@ static bool ext4_should_enable_dax(struct inode *inode) return false; } -void ext4_set_inode_flags(struct inode *inode) +void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; + WARN_ON_ONCE(IS_DAX(inode) && init); + if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) @@ -4434,8 +4436,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_enable_dax(inode)) + + /* Because of the way inode_set_flags() works we must preserve S_DAX +* here if already set. */ + new_fl |= (inode->i_flags & S_DAX); + if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) @@ -4649,7 +4656,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5813e5e73eab..145083e8cd1e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -381,7 +381,8 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_clear_inode_flag(inode, i); } - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, ); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b99c44d0a91..3cb9b48d3cc4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1348,7 +1348,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, 
false); } return res; } @@ -1375,7 +1375,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res =
[PATCH V3 6/8] fs/ext4: Make DAX mount option a tri-state
From: Ira Weiny We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. This new functionality is limited to ext4 only. Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode. We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX. Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user specified that option for printing. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes from V1: Fix up mounting options to only show an option if specified Fix remount to prevent dax changes Isolate behavior to ext4 only Changes from RFC: Combine remount check for DAX_NEVER with DAX_ALWAYS Update ext4_should_enable_dax() --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 2 ++ fs/ext4/super.c | 67 + 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 86a0994332ce..6235440e4c39 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1168,6 +1168,8 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT0x0004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_DAX_NEVER 0x0008 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x0010 /* For printing options only */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x0008 /* User explicitly specified journal checksum */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 23e42a223235..140b1930e2f4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4400,6 +4400,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) static bool ext4_should_enable_dax(struct inode *inode) { + if (test_opt2(inode->i_sb, DAX_NEVER)) + return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3cb9b48d3cc4..5ba65eb0e2ef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1512,7 +1512,8 @@ enum { Opt_usrjquota, 
Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1579,6 +1580,9 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1726,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING0x0400 +#define MOPT_SKIP 0x0800 static const struct mount_opts { int token; @@ -1775,7 +1780,13 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -2084,13 +2095,32 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef 
CONFIG_FS_DAX - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + switch (token) { + case Opt_dax: + case Opt_dax_always:
[PATCH V3 4/8] fs/ext4: Update ext4_should_use_dax()
From: Ira Weiny S_DAX should only be enabled when the underlying block device supports dax. Change ext4_should_use_dax() to check for device support prior to the overriding mount option. While we are at it, rename the function to ext4_should_enable_dax() as this better reflects the ask as well as matches xfs. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes from RFC Change function name to 'should enable' Clean up bool conversion Reorder this for better bisect-ability --- fs/ext4/inode.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a10ff12194db..d3a4c2ed7a1c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4398,10 +4398,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } -static bool ext4_should_use_dax(struct inode *inode) +static bool ext4_should_enable_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX_ALWAYS)) - return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) @@ -4412,7 +4410,13 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; - return true; + if (!bdev_dax_supported(inode->i_sb->s_bdev, + inode->i_sb->s_blocksize)) + return false; + if (test_opt(inode->i_sb, DAX_ALWAYS)) + return true; + + return false; } void ext4_set_inode_flags(struct inode *inode) @@ -4430,7 +4434,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_use_dax(inode)) + if (ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; -- 2.25.1
[PATCH V3 2/8] fs/ext4: Disallow verity if inode is DAX
From: Ira Weiny Verity and DAX are incompatible. Changing the DAX mode due to a verity flag change is wrong without a corresponding address_space_operations update. Make the 2 options mutually exclusive by returning an error if DAX was set first. (Setting DAX is already disabled if Verity is set first.) Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes from V2: Remove Section title 'Verity and DAX' Changes: remove WARN_ON_ONCE Add documentation for DAX/Verity exclusivity --- Documentation/filesystems/ext4/verity.rst | 3 +++ fs/ext4/verity.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Documentation/filesystems/ext4/verity.rst b/Documentation/filesystems/ext4/verity.rst index 3e4c0ee0e068..e99ff3fd09f7 100644 --- a/Documentation/filesystems/ext4/verity.rst +++ b/Documentation/filesystems/ext4/verity.rst @@ -39,3 +39,6 @@ is encrypted as well as the data itself. Verity files cannot have blocks allocated past the end of the verity metadata. + +Verity and DAX are not compatible and attempts to set both of these flags +on a file will fail. diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..f05a09fb2ae4 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; + if (IS_DAX(inode)) + return -EINVAL; + if (ext4_verity_in_progress(inode)) return -EBUSY; -- 2.25.1
[PATCH V3 8/8] Documentation/dax: Update DAX enablement for ext4
From: Ira Weiny Update the document to reflect that ext4 and xfs now behave the same. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes from RFC: Update with ext2 text... --- Documentation/filesystems/dax.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 735fb4b54117..265c4f808dbf 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -25,7 +25,7 @@ size when creating the filesystem. Currently 3 filesystems support DAX: ext2, ext4 and xfs. Enabling DAX on them is different. -Enabling DAX on ext4 and ext2 +Enabling DAX on ext2 - When mounting the filesystem, use the "-o dax" option on the command line or @@ -33,8 +33,8 @@ add 'dax' to the options in /etc/fstab. This works to enable DAX on all files within the filesystem. It is equivalent to the '-o dax=always' behavior below. -Enabling DAX on xfs +Enabling DAX on xfs and ext4 + Summary --- -- 2.25.1
[PATCH V3 0/8] Enable ext4 support for per-file/directory DAX operations
From: Ira Weiny Changes from V2: Rework DAX exclusivity with verity and encryption based on feedback from Eric Enable the same per file DAX support in ext4 as was done for xfs. This series builds and depends on the V11 series for xfs.[1] This passes the same xfstests test as XFS. The only issue is that this modifies the old mount option parsing code rather than waiting for the new parsing code to be finalized. This series starts with 3 fixes which include making Verity and Encrypt truly mutually exclusive from DAX. I think these first 3 patches should be picked up for 5.8 regardless of what is decided regarding the mount parsing. [1] https://lore.kernel.org/lkml/20200428002142.404144-1-ira.we...@intel.com/ To: linux-kernel@vger.kernel.org Cc: "Darrick J. Wong" Cc: Dan Williams Cc: Dave Chinner Cc: Christoph Hellwig Cc: "Theodore Y. Ts'o" Cc: Jan Kara Cc: linux-e...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Ira Weiny (8): fs/ext4: Narrow scope of DAX check in setflags fs/ext4: Disallow verity if inode is DAX fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS fs/ext4: Update ext4_should_use_dax() fs/ext4: Only change S_DAX on inode load fs/ext4: Make DAX mount option a tri-state fs/ext4: Introduce DAX inode flag Documentation/dax: Update DAX enablement for ext4 Documentation/filesystems/dax.txt | 6 +- Documentation/filesystems/ext4/verity.rst | 3 + fs/ext4/ext4.h| 22 +-- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 25 +-- fs/ext4/ioctl.c | 41 ++-- fs/ext4/super.c | 80 ++- fs/ext4/verity.c | 5 +- include/uapi/linux/fs.h | 1 + 9 files changed, 148 insertions(+), 37 deletions(-) -- 2.25.1
[PATCH V3 3/8] fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS
From: Ira Weiny In prep for the new tri-state mount option which then introduces EXT4_MOUNT_DAX_NEVER. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny --- Changes: New patch --- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 12 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..1a3daf2d18ef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1123,9 +1123,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_MINIX_DF0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX -#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else -#define EXT4_MOUNT_DAX 0 +#define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA0x00400 /* Write data to journal */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..a10ff12194db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4400,7 +4400,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) static bool ext4_should_use_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX)) + if (!test_opt(inode->i_sb, DAX_ALWAYS)) return false; if (!S_ISREG(inode->i_mode)) return false; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..7b99c44d0a91 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1775,7 +1775,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -3982,7 +3982,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } - if (test_opt(sb, 
DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4092,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); @@ -5412,7 +5412,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); err = -EINVAL; @@ -5433,10 +5433,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX; + sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- 2.25.1
[PATCH V3 7/8] fs/ext4: Introduce DAX inode flag
From: Ira Weiny Add a flag to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time with the exception of if VERITY or ENCRYPT is set. Disallow setting VERITY or ENCRYPT if DAX is set. Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny --- Change from V2: Add in making verity and DAX exclusive. 'Squash' in making encryption and DAX exclusive. Add in EXT4_INODE_DAX flag definition to be compatible with ext4_[set|test]_inode_flag() bit operations Use ext4_[set|test]_inode_flag() bit operations to be consistent with other code. Change from V0: Add FS_DAX_FL to include/uapi/linux/fs.h to be consistent Move ext4_dax_dontcache() to ext4_ioctl_setflags() This ensures that it is only set when the flags are going to be set and not if there is an error Also this sets don't cache in the FS_IOC_SETFLAGS case Change from RFC: use new d_mark_dontcache() Allow caching if ALWAYS/NEVER is set Rebased to latest Linus master Change flag to unused 0x0100 update ext4_should_enable_dax() --- fs/ext4/ext4.h | 14 ++ fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 34 +- fs/ext4/super.c | 3 +++ fs/ext4/verity.c| 2 +- include/uapi/linux/fs.h | 1 + 6 files changed, 49 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6235440e4c39..467c30a789b6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -415,13 +415,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x0010 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x0020 /* Inode used for large EA */ /* 0x0040 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL0x0100 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL0x1000 /* Inode has inline data. 
*/ #define EXT4_PROJINHERIT_FL0x2000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x4000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x8000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x715BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE0x614BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -429,14 +432,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ -EXT4_PROJINHERIT_FL) +EXT4_PROJINHERIT_FL | \ +EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -488,6 +493,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 24, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
*/ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 140b1930e2f4..ae61db8b8bae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4418,7 +4418,7 @@ static bool ext4_should_enable_dax(struct inode *inode) if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; - return false; + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index
Re: general protection fault in kobject_get (2)
On Tue, May 19, 2020 at 09:53:16PM -0700, syzbot wrote: > Hello, > > syzbot found the following crash on: > > HEAD commit:d00f26b6 Merge git://git.kernel.org/pub/scm/linux/kernel/g.. > git tree: net-next > console output: https://syzkaller.appspot.com/x/log.txt?x=1316343c10 > kernel config: https://syzkaller.appspot.com/x/.config?x=26d0bd769afe1a2c > dashboard link: https://syzkaller.appspot.com/bug?extid=407fd358a932bbf639c6 > compiler: gcc (GCC) 9.0.0 20181231 (experimental) > > Unfortunately, I don't have any reproducer for this crash yet. > > IMPORTANT: if you fix the bug, please add the following tag to the commit: > Reported-by: syzbot+407fd358a932bbf63...@syzkaller.appspotmail.com > > general protection fault, probably for non-canonical address > 0xdc13: [#1] PREEMPT SMP KASAN > KASAN: null-ptr-deref in range [0x0098-0x009f] > CPU: 1 PID: 16682 Comm: syz-executor.3 Not tainted 5.7.0-rc4-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > RIP: 0010:kobject_get+0x30/0x150 lib/kobject.c:640 > Code: 53 e8 d4 7e c6 fd 4d 85 e4 0f 84 a2 00 00 00 e8 c6 7e c6 fd 49 8d 7c 24 > 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa > 83 e2 07 38 d0 7f 08 84 c0 0f 85 e7 00 00 00 > RSP: 0018:c9000772f240 EFLAGS: 00010203 > RAX: dc00 RBX: 85acfca0 RCX: c9000fc67000 > RDX: 0013 RSI: 83acadfa RDI: 009c > RBP: 0060 R08: 8880a8dfa4c0 R09: ed100a03f403 > R10: 8880501fa017 R11: ed100a03f402 R12: 0060 > R13: c9000772f3c0 R14: 88805d1ec4e8 R15: 88805d1ec580 > FS: 7f1ebed26700() GS:8880ae70() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 004d88f0 CR3: a86c4000 CR4: 001406e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > get_device+0x20/0x30 drivers/base/core.c:2620 > __ib_get_client_nl_info+0x1d4/0x2a0 drivers/infiniband/core/device.c:1863 > ib_get_client_nl_info+0x30/0x180 drivers/infiniband/core/device.c:1883 > nldev_get_chardev+0x52b/0xa40 drivers/infiniband/core/nldev.c:1625 > 
rdma_nl_rcv_msg drivers/infiniband/core/netlink.c:195 [inline] > rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] > rdma_nl_rcv+0x586/0x900 drivers/infiniband/core/netlink.c:259 > netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] > netlink_unicast+0x537/0x740 net/netlink/af_netlink.c:1329 > netlink_sendmsg+0x882/0xe10 net/netlink/af_netlink.c:1918 > sock_sendmsg_nosec net/socket.c:652 [inline] > sock_sendmsg+0xcf/0x120 net/socket.c:672 > sys_sendmsg+0x6e6/0x810 net/socket.c:2352 > ___sys_sendmsg+0x100/0x170 net/socket.c:2406 > __sys_sendmsg+0xe5/0x1b0 net/socket.c:2439 > do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 > entry_SYSCALL_64_after_hwframe+0x49/0xb3 > RIP: 0033:0x45c829 > Code: 0d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f > 83 db b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 > RSP: 002b:7f1ebed25c78 EFLAGS: 0246 ORIG_RAX: 002e > RAX: ffda RBX: 004ff720 RCX: 0045c829 > RDX: RSI: 2200 RDI: 0003 > RBP: 0078bf00 R08: R09: > R10: R11: 0246 R12: > R13: 09ad R14: 004d5f10 R15: 7f1ebed266d4 > Modules linked in: > ---[ end trace 239938a6c4c3c99f ]--- > RIP: 0010:kobject_get+0x30/0x150 lib/kobject.c:640 > Code: 53 e8 d4 7e c6 fd 4d 85 e4 0f 84 a2 00 00 00 e8 c6 7e c6 fd 49 8d 7c 24 > 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa > 83 e2 07 38 d0 7f 08 84 c0 0f 85 e7 00 00 00 > RSP: 0018:c9000772f240 EFLAGS: 00010203 > RAX: dc00 RBX: 85acfca0 RCX: c9000fc67000 > RDX: 0013 RSI: 83acadfa RDI: 009c > RBP: 0060 R08: 8880a8dfa4c0 R09: ed100a03f403 > R10: 8880501fa017 R11: ed100a03f402 R12: 0060 > R13: c9000772f3c0 R14: 88805d1ec4e8 R15: 88805d1ec580 > FS: 7f1ebed26700() GS:8880ae70() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 0073fad4 CR3: a86c4000 CR4: 001406e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 Looks like an IB/rdma issue, poke those developers please :)
Re: [RFC PATCH 0/8] Qualcomm Cloud AI 100 driver
On Tue, May 19, 2020 at 10:11:35PM -0700, Bjorn Andersson wrote: > On Tue 19 May 21:59 PDT 2020, Greg Kroah-Hartman wrote: > > > On Tue, May 19, 2020 at 10:41:15PM +0200, Daniel Vetter wrote: > > > > Ok, that's a decision you are going to have to push upward on, as we > > > > really can't take this without a working, open, userspace. > > > > > > Uh wut. > > > > > > So the merge criteria for drivers/accel (atm still drivers/misc but I > > > thought that was interim until more drivers showed up) isn't actually > > > "totally-not-a-gpu accel driver without open source userspace". > > > > > > Instead it's "totally-not-a-gpu accel driver without open source > > > userspace" _and_ you have to be best buddies with Greg. Or at least > > > not be on the naughty company list. Since for habanalabs all you > > > wanted is a few test cases to exercise the ioctls. Not the entire > > > userspace. > > > > Habanalabs now has their full library opensourced that their tools use > > directly, so that's not an argument anymore. > > > > My primary point here is the copyright owner of this code, because of > > that, I'm not going to object to allowing this to be merged without open > > userspace code. > > > > So because it's copyright Linux Foundation you are going to accept it > without user space, after all? Huh, no, the exact opposite, sorry, drop the "not" in that above sentence. My bad. greg k-h
Re: [PATCH v1 2/6] bus: mhi: core: Mark device inactive soon after host issues a shutdown
Hi Bhaumik, Thank you for the patch! Yet something to improve: [auto build test ERROR on next-20200519] [cannot apply to linus/master v5.7-rc6 v5.7-rc5 v5.7-rc4 v5.7-rc6] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system. BTW, we also suggest to use '--base' option to specify the base tree in git format-patch, please see https://stackoverflow.com/a/37406982] url: https://github.com/0day-ci/linux/commits/Bhaumik-Bhatt/Bug-fixes-and-bootup-and-shutdown-improvements/20200520-083400 base:fb57b1fabcb28f358901b2df90abd2b48abc1ca8 config: riscv-allyesconfig (attached as .config) compiler: riscv64-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot All errors (new ones prefixed by >>, old ones prefixed by <<): drivers/bus/mhi/core/main.c: In function 'mhi_intvec_threaded_handler': >> drivers/bus/mhi/core/main.c:397:8: error: implicit declaration of function >> 'mhi_is_active' [-Werror=implicit-function-declaration] 397 | if (!mhi_is_active(mhi_cntrl)) { |^ cc1: some warnings being treated as errors vim +/mhi_is_active +397 drivers/bus/mhi/core/main.c 371 372 irqreturn_t mhi_intvec_threaded_handler(int irq_number, void *priv) 373 { 374 struct mhi_controller *mhi_cntrl = priv; 375 struct device *dev = _cntrl->mhi_dev->dev; 376 enum mhi_state state = MHI_STATE_MAX; 377 enum mhi_pm_state pm_state = 0; 378 enum mhi_ee_type ee = 0; 379 bool handle_rddm = false; 380 381 write_lock_irq(_cntrl->pm_lock); 382 if (!MHI_REG_ACCESS_VALID(mhi_cntrl->pm_state)) { 383 write_unlock_irq(_cntrl->pm_lock); 384 goto exit_intvec; 385 } 386 387 state = mhi_get_mhi_state(mhi_cntrl); 388 ee = mhi_cntrl->ee; 389 mhi_cntrl->ee = 
mhi_get_exec_env(mhi_cntrl); 390 dev_dbg(dev, "local ee:%s device ee:%s dev_state:%s\n", 391 TO_MHI_EXEC_STR(mhi_cntrl->ee), TO_MHI_EXEC_STR(ee), 392 TO_MHI_STATE_STR(state)); 393 394 /* If device supports RDDM don't bother processing SYS error */ 395 if (mhi_cntrl->rddm_image) { 396 /* host may be performing a device power down already */ > 397 if (!mhi_is_active(mhi_cntrl)) { 398 write_unlock_irq(_cntrl->pm_lock); 399 goto exit_intvec; 400 } 401 402 if (mhi_cntrl->ee == MHI_EE_RDDM && mhi_cntrl->ee != ee) { 403 /* prevent clients from queueing any more packets */ 404 pm_state = mhi_tryset_pm_state(mhi_cntrl, 405 MHI_PM_SYS_ERR_DETECT); 406 if (pm_state == MHI_PM_SYS_ERR_DETECT) 407 handle_rddm = true; 408 } 409 410 write_unlock_irq(_cntrl->pm_lock); 411 412 if (handle_rddm) { 413 dev_err(dev, "RDDM event occurred!\n"); 414 mhi_cntrl->status_cb(mhi_cntrl, MHI_CB_EE_RDDM); 415 wake_up_all(_cntrl->state_event); 416 } 417 goto exit_intvec; 418 } 419 420 if (state == MHI_STATE_SYS_ERR) { 421 dev_dbg(dev, "System error detected\n"); 422 pm_state = mhi_tryset_pm_state(mhi_cntrl, 423 MHI_PM_SYS_ERR_DETECT); 424 } 425 426 write_unlock_irq(_cntrl->pm_lock); 427 428 if (pm_state == MHI_PM_SYS_ERR_DETECT) { 429 wake_up_all(_cntrl->state_event); 430 431 /* For fatal errors, we let controller decide next step */ 432 if (MHI_IN_PBL(ee)) 433 mhi_cntrl->status_cb(mhi_cntrl, MHI_CB_FATAL_ERROR); 434 else 435 mhi_pm_sys_err_handler(mhi_cntrl); 436 } 437 438 exit_intvec: 439 440 return IRQ_HANDLED; 441 } 442 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
[tip:x86/urgent] BUILD SUCCESS d7110a26e5905ec2fe3fc88bc6a538901accb72b
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git x86/urgent branch HEAD: d7110a26e5905ec2fe3fc88bc6a538901accb72b x86/mmiotrace: Use cpumask_available() for cpumask_var_t variables elapsed time: 486m configs tested: 98 configs skipped: 74 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig sparcallyesconfig mips allyesconfig m68k allyesconfig i386 allnoconfig i386defconfig i386 debian-10.3 i386 allyesconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a006-20200519 i386 randconfig-a005-20200519 i386 randconfig-a001-20200519 i386 randconfig-a003-20200519 i386 randconfig-a004-20200519 i386 randconfig-a002-20200519 x86_64 randconfig-a003-20200519 x86_64 randconfig-a005-20200519 x86_64 randconfig-a004-20200519 x86_64 randconfig-a006-20200519 x86_64 randconfig-a002-20200519 x86_64 randconfig-a001-20200519 i386 randconfig-a012-20200519 i386 randconfig-a014-20200519 i386 randconfig-a016-20200519 i386 randconfig-a011-20200519 i386 randconfig-a015-20200519 i386 randconfig-a013-20200519 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 
allmodconfig s390defconfig x86_64 defconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig um allmodconfig umallnoconfig um allyesconfig um defconfig x86_64 rhel x86_64 rhel-7.6 x86_64rhel-7.6-kselftests x86_64 rhel-7.2-clear x86_64lkp x86_64 fedora-25 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:perf/core] BUILD SUCCESS c50c75e9b87946499a62bffc021e95c87a1d57cd
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/core branch HEAD: c50c75e9b87946499a62bffc021e95c87a1d57cd perf/core: Replace zero-length array with flexible-array elapsed time: 486m configs tested: 98 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig sparcallyesconfig mips allyesconfig m68k allyesconfig i386 allnoconfig i386defconfig i386 debian-10.3 i386 allyesconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a006-20200519 i386 randconfig-a005-20200519 i386 randconfig-a001-20200519 i386 randconfig-a003-20200519 i386 randconfig-a004-20200519 i386 randconfig-a002-20200519 x86_64 randconfig-a003-20200519 x86_64 randconfig-a005-20200519 x86_64 randconfig-a004-20200519 x86_64 randconfig-a006-20200519 x86_64 randconfig-a002-20200519 x86_64 randconfig-a001-20200519 i386 randconfig-a012-20200519 i386 randconfig-a014-20200519 i386 randconfig-a016-20200519 i386 randconfig-a011-20200519 i386 randconfig-a015-20200519 i386 randconfig-a013-20200519 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig 
s390defconfig x86_64 defconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig um allmodconfig umallnoconfig um allyesconfig um defconfig x86_64 rhel x86_64 rhel-7.6 x86_64rhel-7.6-kselftests x86_64 rhel-7.2-clear x86_64lkp x86_64 fedora-25 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:locking/core] BUILD SUCCESS db78538c75e49c09b002a2cd96a19ae0c39be771
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/core branch HEAD: db78538c75e49c09b002a2cd96a19ae0c39be771 locking/lockdep: Replace zero-length array with flexible-array elapsed time: 486m configs tested: 98 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig sparcallyesconfig mips allyesconfig m68k allyesconfig i386 allnoconfig i386 allyesconfig i386defconfig i386 debian-10.3 ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a006-20200519 i386 randconfig-a005-20200519 i386 randconfig-a001-20200519 i386 randconfig-a003-20200519 i386 randconfig-a004-20200519 i386 randconfig-a002-20200519 x86_64 randconfig-a003-20200519 x86_64 randconfig-a005-20200519 x86_64 randconfig-a004-20200519 x86_64 randconfig-a006-20200519 x86_64 randconfig-a002-20200519 x86_64 randconfig-a001-20200519 i386 randconfig-a012-20200519 i386 randconfig-a014-20200519 i386 randconfig-a016-20200519 i386 randconfig-a011-20200519 i386 randconfig-a015-20200519 i386 randconfig-a013-20200519 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 
allmodconfig s390defconfig x86_64 defconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig umallnoconfig um defconfig um allmodconfig um allyesconfig x86_64 rhel x86_64 rhel-7.6 x86_64rhel-7.6-kselftests x86_64 rhel-7.2-clear x86_64lkp x86_64 fedora-25 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:sched/core] BUILD SUCCESS d505b8af58912ae1e1a211fabc9995b19bd40828
allnoconfig i386 randconfig-a006-20200519 i386 randconfig-a005-20200519 i386 randconfig-a001-20200519 i386 randconfig-a003-20200519 i386 randconfig-a004-20200519 i386 randconfig-a002-20200519 x86_64 randconfig-a003-20200519 x86_64 randconfig-a005-20200519 x86_64 randconfig-a004-20200519 x86_64 randconfig-a006-20200519 x86_64 randconfig-a002-20200519 x86_64 randconfig-a001-20200519 i386 randconfig-a012-20200519 i386 randconfig-a014-20200519 i386 randconfig-a016-20200519 i386 randconfig-a011-20200519 i386 randconfig-a015-20200519 i386 randconfig-a013-20200519 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig s390defconfig x86_64 defconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig um allmodconfig umallnoconfig um allyesconfig um defconfig x86_64 rhel x86_64 rhel-7.6 x86_64rhel-7.6-kselftests x86_64 rhel-7.2-clear x86_64lkp x86_64 fedora-25 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:sched/urgent] BUILD SUCCESS 39f23ce07b9355d05a64ae303ce20d1c4b92b957
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/urgent branch HEAD: 39f23ce07b9355d05a64ae303ce20d1c4b92b957 sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list elapsed time: 486m configs tested: 98 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig sparcallyesconfig mips allyesconfig m68k allyesconfig i386 allnoconfig i386defconfig i386 debian-10.3 i386 allyesconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a006-20200519 i386 randconfig-a005-20200519 i386 randconfig-a001-20200519 i386 randconfig-a003-20200519 i386 randconfig-a004-20200519 i386 randconfig-a002-20200519 x86_64 randconfig-a003-20200519 x86_64 randconfig-a005-20200519 x86_64 randconfig-a004-20200519 x86_64 randconfig-a006-20200519 x86_64 randconfig-a002-20200519 x86_64 randconfig-a001-20200519 i386 randconfig-a012-20200519 i386 randconfig-a014-20200519 i386 randconfig-a016-20200519 i386 randconfig-a011-20200519 i386 randconfig-a015-20200519 i386 randconfig-a013-20200519 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig 
s390defconfig x86_64 defconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig um allmodconfig umallnoconfig um allyesconfig um defconfig x86_64 rhel x86_64 rhel-7.6 x86_64rhel-7.6-kselftests x86_64 rhel-7.2-clear x86_64lkp x86_64 fedora-25 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v2] /dev/mem: Revoke mappings when a driver claims the region
On Tue, May 19, 2020 at 11:27:02AM -0700, Dan Williams wrote: > On Tue, May 19, 2020 at 5:11 AM Greg KH wrote: > > > > On Tue, May 19, 2020 at 12:03:06AM -0700, Dan Williams wrote: > > > Close the hole of holding a mapping over kernel driver takeover event of > > > a given address range. > > > > > > Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") > > > introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the > > > kernel against scenarios where a /dev/mem user tramples memory that a > > > kernel driver owns. However, this protection only prevents *new* read(), > > > write() and mmap() requests. Established mappings prior to the driver > > > calling request_mem_region() are left alone. > > > > > > Especially with persistent memory, and the core kernel metadata that is > > > stored there, there are plentiful scenarios for a /dev/mem user to > > > violate the expectations of the driver and cause amplified damage. > > > > > > Teach request_mem_region() to find and shoot down active /dev/mem > > > mappings that it believes it has successfully claimed for the exclusive > > > use of the driver. Effectively a driver call to request_mem_region() > > > becomes a hole-punch on the /dev/mem device. > > > > > > The typical usage of unmap_mapping_range() is part of > > > truncate_pagecache() to punch a hole in a file, but in this case the > > > implementation is only doing the "first half" of a hole punch. Namely it > > > is just evacuating current established mappings of the "hole", and it > > > relies on the fact that /dev/mem establishes mappings in terms of > > > absolute physical address offsets. Once existing mmap users are > > > invalidated they can attempt to re-establish the mapping, or attempt to > > > continue issuing read(2) / write(2) to the invalidated extent, but they > > > will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can > > > block those subsequent accesses. 
> > > > > > Cc: Arnd Bergmann > > > Cc: Ingo Molnar > > > Cc: Kees Cook > > > Cc: Russell King > > > Cc: Andrew Morton > > > Cc: Greg Kroah-Hartman > > > Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") > > > Signed-off-by: Dan Williams > > > --- > > > Changes since v1 [1]: > > > > > > - updated the changelog to describe the usage of unmap_mapping_range(). > > > No other logic changes: > > > > > > [1]: > > > http://lore.kernel.org/r/158662721802.1893045.12301414116114602646.st...@dwillia2-desk3.amr.corp.intel.com > > > > > > Greg, Andrew, > > > > > > I have a regression test for this case now. This was found by an > > > intermittent data corruption scenario on pmem from a test tool using > > > /dev/mem. > > > > Ick, why are test tools messing around in /dev/mem :) > > Yeah, I'm all for useful tools, just not at the expense of kernel integrity. > > > Anyway, this seems sane to me, want me to take it through my tree? > > Yes please, seems to belong with the driver core. Ok, will wait for a v3 to handle the issue that was just found in review. thanks, greg k-h
Re: [PATCH 09/15] device core: Add ability to handle multiple dma offsets
On Tue, May 19, 2020 at 04:34:07PM -0400, Jim Quinlan wrote: > diff --git a/include/linux/device.h b/include/linux/device.h > index ac8e37cd716a..6cd916860b5f 100644 > --- a/include/linux/device.h > +++ b/include/linux/device.h > @@ -493,6 +493,8 @@ struct dev_links_info { > * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller > * DMA limit than the device itself supports. > * @dma_pfn_offset: offset of DMA memory range relatively of RAM > + * @dma_map: Like dma_pfn_offset but used when there are multiple > + * pfn offsets for multiple dma-ranges. > * @dma_parms: A low level driver may set these to teach IOMMU code > about > * segment limitations. > * @dma_pools: Dma pools (if dma'ble device). > @@ -578,7 +580,12 @@ struct device { >allocations such descriptors. */ > u64 bus_dma_limit; /* upstream dma constraint */ > unsigned long dma_pfn_offset; > - > +#ifdef CONFIG_DMA_PFN_OFFSET_MAP > + const void *dma_offset_map; /* Like dma_pfn_offset, but for > + * the unlikely case of multiple > + * offsets. If non-null, dma_pfn_offset > + * will be 0. */ > +#endif > struct device_dma_parameters *dma_parms; > > struct list_headdma_pools; /* dma pools (if dma'ble) */ I'll defer to Christoph here, but I thought we were trying to get rid of stuff like this from struct device, not add new things to it for dma apis. And why is it a void *? thanks, greg k-h
Re: [PATCH v4 2/4] kasan: record and print the free track
> On Wed, May 20, 2020 at 6:03 AM Walter Wu wrote: > > > > > On Tue, May 19, 2020 at 4:25 AM Walter Wu > > > wrote: > > > > > > > > Move free track from slub alloc meta-data to slub free meta-data in > > > > order to make struct kasan_free_meta size is 16 bytes. It is a good > > > > size because it is the minimal redzone size and a good number of > > > > alignment. > > > > > > > > For free track in generic KASAN, we do the modification in struct > > > > kasan_alloc_meta and kasan_free_meta: > > > > - remove free track from kasan_alloc_meta. > > > > - add free track into kasan_free_meta. > > > > > > > > [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 > > > > > > > > Signed-off-by: Walter Wu > > > > Suggested-by: Dmitry Vyukov > > > > Cc: Andrey Ryabinin > > > > Cc: Dmitry Vyukov > > > > Cc: Alexander Potapenko > > > > --- > > > > mm/kasan/common.c | 22 ++ > > > > mm/kasan/generic.c | 18 ++ > > > > mm/kasan/kasan.h | 7 +++ > > > > mm/kasan/report.c | 20 > > > > mm/kasan/tags.c| 37 + > > > > 5 files changed, 64 insertions(+), 40 deletions(-) > > > > > > > > diff --git a/mm/kasan/common.c b/mm/kasan/common.c > > > > index 8bc618289bb1..47b53912f322 100644 > > > > --- a/mm/kasan/common.c > > > > +++ b/mm/kasan/common.c > > > > @@ -51,7 +51,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags) > > > > return stack_depot_save(entries, nr_entries, flags); > > > > } > > > > > > > > -static inline void set_track(struct kasan_track *track, gfp_t flags) > > > > +void kasan_set_track(struct kasan_track *track, gfp_t flags) > > > > { > > > > track->pid = current->pid; > > > > track->stack = kasan_save_stack(flags); > > > > @@ -299,24 +299,6 @@ struct kasan_free_meta *get_free_info(struct > > > > kmem_cache *cache, > > > > return (void *)object + cache->kasan_info.free_meta_offset; > > > > } > > > > > > > > - > > > > -static void kasan_set_free_info(struct kmem_cache *cache, > > > > - void *object, u8 tag) > > > > -{ > > > > - struct kasan_alloc_meta *alloc_meta; > > > > 
- u8 idx = 0; > > > > - > > > > - alloc_meta = get_alloc_info(cache, object); > > > > - > > > > -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY > > > > - idx = alloc_meta->free_track_idx; > > > > - alloc_meta->free_pointer_tag[idx] = tag; > > > > - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; > > > > -#endif > > > > - > > > > - set_track(_meta->free_track[idx], GFP_NOWAIT); > > > > -} > > > > - > > > > void kasan_poison_slab(struct page *page) > > > > { > > > > unsigned long i; > > > > @@ -492,7 +474,7 @@ static void *__kasan_kmalloc(struct kmem_cache > > > > *cache, const void *object, > > > > KASAN_KMALLOC_REDZONE); > > > > > > > > if (cache->flags & SLAB_KASAN) > > > > - set_track(_alloc_info(cache, object)->alloc_track, > > > > flags); > > > > + kasan_set_track(_alloc_info(cache, > > > > object)->alloc_track, flags); > > > > > > > > return set_tag(object, tag); > > > > } > > > > diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c > > > > index 3372bdcaf92a..763d8a13e0ac 100644 > > > > --- a/mm/kasan/generic.c > > > > +++ b/mm/kasan/generic.c > > > > @@ -344,3 +344,21 @@ void kasan_record_aux_stack(void *addr) > > > > alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; > > > > alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); > > > > } > > > > + > > > > +void kasan_set_free_info(struct kmem_cache *cache, > > > > + void *object, u8 tag) > > > > +{ > > > > + struct kasan_free_meta *free_meta; > > > > + > > > > + free_meta = get_free_info(cache, object); > > > > + kasan_set_track(_meta->free_track, GFP_NOWAIT); > > > > +} > > > > + > > > > +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, > > > > + void *object, u8 tag) > > > > +{ > > > > + struct kasan_free_meta *free_meta; > > > > + > > > > + free_meta = get_free_info(cache, object); > > > > + return _meta->free_track; > > > > +} > > > > diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h > > > > index a7391bc83070..ad897ec36545 100644 > > > > --- a/mm/kasan/kasan.h > > > > +++ 
b/mm/kasan/kasan.h > > > > @@ -127,6 +127,9 @@ struct kasan_free_meta { > > > > * Otherwise it might be used for the allocator freelist. > > > > */ > > > > struct qlist_node quarantine_link; > > > > +#ifdef CONFIG_KASAN_GENERIC > > > > + struct kasan_track free_track; > > > > +#endif > > > > }; > > > > > > > > struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, > > > > @@ -168,6 +171,10 @@ void kasan_report_invalid_free(void *object, > > > > unsigned long ip); > > > > struct page *kasan_addr_to_page(const void *addr); > > > > > > > >
Re: [PATCH 5.6 000/192] 5.6.14-rc2 review
On Tue, May 19, 2020 at 01:37:20PM -0600, shuah wrote: > On 5/18/20 11:47 PM, Greg Kroah-Hartman wrote: > > This is the start of the stable review cycle for the 5.6.14 release. > > There are 192 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Thu, 21 May 2020 05:45:41 +0000. > > Anything received after that time might be too late. > > > > The whole patch series can be found in one patch at: > > > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.6.14-rc2.gz > > or in the git tree and branch at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > linux-5.6.y > > and the diffstat can be found below. > > > > thanks, > > > > greg k-h > > > > Compiled and booted on my test system. No dmesg regressions. Thanks for testing all of these and letting me know. greg k-h
Re: [PATCH 5.6 000/192] 5.6.14-rc2 review
On Tue, May 19, 2020 at 09:30:22AM -0700, Guenter Roeck wrote: > On 5/18/20 10:47 PM, Greg Kroah-Hartman wrote: > > This is the start of the stable review cycle for the 5.6.14 release. > > There are 192 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Thu, 21 May 2020 05:45:41 +0000. > > Anything received after that time might be too late. > > > > Build results: > total: 155 pass: 155 fail: 0 > Qemu test results: > total: 431 pass: 431 fail: 0 Great, thanks for testing all of these and letting me know. greg k-h
Re: [PATCH 06/12] xen-blkfront: add callbacks for PM suspend and hibernation
Hi Anchal, Thank you for the patch! Yet something to improve: [auto build test ERROR on linus/master] [also build test ERROR on v5.7-rc6] [cannot apply to xen-tip/linux-next tip/irq/core tip/auto-latest next-20200519] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system. BTW, we also suggest to use '--base' option to specify the base tree in git format-patch, please see https://stackoverflow.com/a/37406982] url: https://github.com/0day-ci/linux/commits/Anchal-Agarwal/Fix-PM-hibernation-in-Xen-guests/20200520-073211 base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 03fb3acae4be8a6b680ffedb220a8b6c07260b40 config: x86_64-rhel (attached as .config) compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot All error/warnings (new ones prefixed by >>, old ones prefixed by <<): drivers/block/xen-blkfront.c: In function 'blkfront_freeze': >> drivers/block/xen-blkfront.c:2699:30: warning: missing terminating " >> character xenbus_dev_error(dev, err, "Hibernation Failed. ^ >> drivers/block/xen-blkfront.c:2699:30: error: missing terminating " character xenbus_dev_error(dev, err, "Hibernation Failed. 
^~~~ >> drivers/block/xen-blkfront.c:2700:4: error: 'The' undeclared (first use in >> this function) The ring is still busy"); ^~~ drivers/block/xen-blkfront.c:2700:4: note: each undeclared identifier is reported only once for each function it appears in >> drivers/block/xen-blkfront.c:2700:8: error: expected ')' before 'ring' The ring is still busy"); ^~~~ drivers/block/xen-blkfront.c:2700:26: warning: missing terminating " character The ring is still busy"); ^ drivers/block/xen-blkfront.c:2700:26: error: missing terminating " character The ring is still busy"); ^~~ >> drivers/block/xen-blkfront.c:2704:2: error: expected ';' before '}' token } ^ vim +2699 drivers/block/xen-blkfront.c 2672 2673 static int blkfront_freeze(struct xenbus_device *dev) 2674 { 2675 unsigned int i; 2676 struct blkfront_info *info = dev_get_drvdata(>dev); 2677 struct blkfront_ring_info *rinfo; 2678 /* This would be reasonable timeout as used in xenbus_dev_shutdown() */ 2679 unsigned int timeout = 5 * HZ; 2680 unsigned long flags; 2681 int err = 0; 2682 2683 info->connected = BLKIF_STATE_FREEZING; 2684 2685 blk_mq_freeze_queue(info->rq); 2686 blk_mq_quiesce_queue(info->rq); 2687 2688 for_each_rinfo(info, rinfo, i) { 2689 /* No more gnttab callback work. */ 2690 gnttab_cancel_free_callback(>callback); 2691 /* Flush gnttab callback work. Must be done with no locks held. */ 2692 flush_work(>work); 2693 } 2694 2695 for_each_rinfo(info, rinfo, i) { 2696 spin_lock_irqsave(>ring_lock, flags); 2697 if (RING_FULL(>ring) 2698 || RING_HAS_UNCONSUMED_RESPONSES(>ring)) { > 2699 xenbus_dev_error(dev, err, "Hibernation Failed. 
> 2700 The ring is still busy"); 2701 info->connected = BLKIF_STATE_CONNECTED; 2702 spin_unlock_irqrestore(>ring_lock, flags); 2703 return -EBUSY; > 2704 } 2705 spin_unlock_irqrestore(>ring_lock, flags); 2706 } 2707 /* Kick the backend to disconnect */ 2708 xenbus_switch_state(dev, XenbusStateClosing); 2709 2710 /* 2711 * We don't want to move forward before the frontend is diconnected 2712 * from the backend cleanly. 2713 */ 2714 timeout = wait_for_completion_timeout(>wait_backend_disconnected, 2715timeout); 2716 if (!timeout) { 2717 err = -EBUSY; 2718 xenbus_dev_error(dev, err, "Freezing timed out;" 2719 "the device may become inconsistent state"); 2720 } 2721 2722 return err; 2723 } 2724 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH v2 12/15] ath10k: use new module_firmware_crashed()
Hi all, Since I have been involved quite a bit in the firmware debugging features in iwlwifi, I think I can give a few insights here. But before this, we need to understand that there are several sources of issues: 1) the firmware may crash but the bus is still alive, you can still use the bus to get the crash data 2) the bus is dead, when that happens, the firmware might even be in a good condition, but since the bus is dead, you stop getting any information about the firmware, and then, at some point, you get to the conclusion that the firmware is dead. You can't get the crash data that resides on the other side of the bus (you may have gathered data in the DRAM directly, but that's a different thing), and you don't have much recovery to do besides re-starting the PCI enumeration. At Intel, we have seen both unfortunately. The bus issues are the ones that are trickier obviously. Trickier to detect (because you just get garbage from any request you issue on the bus), and trickier to handle. One can argue that the kernel should *not* handle those and leave this in userspace hands. I guess it all depends on what component you ship to your customer and what your customer asks from you :). > > Hi Luis, > > On Tue, May 19, 2020 at 7:02 AM Luis Chamberlain wrote: > > On Mon, May 18, 2020 at 06:23:33PM -0700, Brian Norris wrote: > > > On Sat, May 16, 2020 at 6:51 AM Johannes Berg > > > wrote: > > > > In addition, look what we have in iwl_trans_pcie_removal_wk(). If we > > > > detect that the device is really wedged enough that the only way we can > > > > still try to recover is by completely unbinding the driver from it, then > > > > we give userspace a uevent for that. I don't remember exactly how and > > > > where that gets used (ChromeOS) though, but it'd be nice to have that > > > > sort of thing as part of the infrastructure, in a sort of two-level > > > > notification? 
> > > > > > > > > We use this on certain devices where we know the underlying hardware > > > has design issues that may lead to device failure > > > > Ah, after reading below I see you meant for iwlwifi. > > Sorry, I was replying to Johannes, who I believe had his "we"="Intel" > hat (as iwlwifi maintainer) on, and was pointing at > iwl_trans_pcie_removal_wk(). > This pcie_removal thing is for the bus dead thing. My 2) above. > > If userspace can indeed grow to support this, that would be fantastic. > > Well, Chrome OS tailors its user space a bit more to the hardware (and > kernel/drivers in use) than the average distro might. We already do > this (for some values of "this") today. Is that "fantastic" to you? :D I guess it can be fantastic if other vendors also suffer from this. Or maybe that could be done as part of the PCI bus driver inside the kernel? > > > > -- then when we see > > > this sort of unrecoverable "firmware-death", we remove the > > > device[*]+driver, force-reset the PCI device (SBR), and try to > > > reload/reattach the driver. This all happens by way of a udev rule. > > > > So you've sprikled your own udev event here as part of your kernel delta? > > No kernel delta -- the event is there already: > iwl_trans_pcie_removal_wk() > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/wireless/intel/iwlwifi/pcie/trans.c?h=v5.6#n2027 > > And you can see our udev rules and scripts, in all their ugly details > here, if you really care: > https://chromium.googlesource.com/chromiumos/overlays/chromiumos-overlay/+/master/net-wireless/iwlwifi_rescan/files/ > > > > We > > > also log this sort of stuff (and metrics around it) for bug reports > > > and health statistics, since we really hope to not see this happen > > > often. > > > > Assuming perfection is ideal but silly. So, what infrastructure do you > > use for this sort of issue? 
> > We don't yet log firmware crashes generally, but for all our current > crash reports (including WARN()), they go through this: > https://chromium.googlesource.com/chromiumos/platform2/+/master/crash-reporter/README.md > > For example, look for "cut here" in: > https://chromium.googlesource.com/chromiumos/platform2/+/master/crash-reporter/anomaly_detector.cc > > For other specific metrics (like counting "EVENT=INACCESSIBLE"), we > use the Chrome UMA system: > https://chromium.googlesource.com/chromiumos/platform2/+/master/metrics/README.md > > I don't imagine the "infrastructure" side of any of that would be > useful to you, but maybe the client-side gathering can at least show > you what we do. > > > > [*] "We" (user space) don't actually do this...it happens via the > > > 'remove_when_gone' module parameter abomination found in iwlwifi. > > > > BTW is this likely a place on iwlwifi where the firmware likely crashed? > > iwl_trans_pcie_removal_wk() is triggered because HW accesses timed out > in a way that is likely due to a dead PCIe endpoint. It's not directly > a firmware crash, although there may be firmware crashes reported > around the same time.
Re: [PATCH] perf evsel: Get group fd from CPU0 for system wide event
Hi Jiri, On 5/18/2020 11:28 AM, Jin, Yao wrote: Hi Jiri, On 5/15/2020 4:33 PM, Jiri Olsa wrote: On Fri, May 15, 2020 at 02:04:57PM +0800, Jin, Yao wrote: SNIP I think I get the root cause. That should be a serious bug in get_group_fd, access violation! For a group mixed with system-wide event and per-core event and the group leader is system-wide event, access violation will happen. perf_evsel__alloc_fd allocates one FD member for system-wide event (only FD(evsel, 0, 0) is valid). But for per core event, perf_evsel__alloc_fd allocates N FD members (N = ncpus). For example, for ncpus is 8, FD(evsel, 0, 0) to FD(evsel, 7, 0) are valid. get_group_fd(struct evsel *evsel, int cpu, int thread) { struct evsel *leader = evsel->leader; fd = FD(leader, cpu, thread); /* access violation may happen here */ } If leader is system-wide event, only the FD(leader, 0, 0) is valid. When get_group_fd accesses FD(leader, 1, 0), access violation happens. My fix is: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 28683b0eb738..db05b8a1e1a8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1440,6 +1440,9 @@ static int get_group_fd(struct evsel *evsel, int cpu, int thread) if (evsel__is_group_leader(evsel)) return -1; + if (leader->core.system_wide && !evsel->core.system_wide) + return -2; so this effectively stops grouping system_wide events with others, and I think it's correct, how about events that differ in cpumask? My understanding for the events that differ in cpumaks is, if the leader's cpumask is not fully matched with the evsel's cpumask then we stop the grouping. Is this understanding correct? I have done some tests and get some conclusions: 1. If the group is mixed with core and uncore events, the system_wide checking can distinguish them. 2. If the group is mixed with core and uncore events and "-a" is specified, the system_wide for core event is also false. So system_wide checking can distinguish them too 3. 
In my test, the issue only occurs when we collect the metric which is mixed with uncore event and core event, so maybe checking the system_wide is OK. should we perhaps ensure this before we call open? go throught all groups and check they are on the same cpus? The issue doesn't happen at most of the time (only for the metric consisting of uncore event and core event), so fallback to stop grouping if call open is failed looks reasonable. Thanks Jin Yao thanks, jirka + /* * Leader must be already processed/open, * if not it's a bug. @@ -1665,6 +1668,11 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, pid = perf_thread_map__pid(threads, thread); group_fd = get_group_fd(evsel, cpu, thread); + if (group_fd == -2) { + errno = EINVAL; + err = -EINVAL; + goto out_close; + } retry_open: test_attr__ready(); It enables the perf_evlist__reset_weak_group. And in the second_pass (in __run_perf_stat), the events will be opened successfully. I have tested OK for this fix on cascadelakex. Thanks Jin Yao Is this fix OK? Another thing is, do you think if we need to rename "evsel->core.system_wide" to "evsel->core.has_cpumask". The "system_wide" may misleading. evsel->core.system_wide = pmu ? pmu->is_uncore : false; "pmu->is_uncore" is true if PMU has a "cpumask". But it's not just uncore PMU which has cpumask. Some other PMUs, e.g. cstate_pkg, also have cpumask. So for this case, "has_cpumask" should be better. But I'm not sure if the change is OK for other case, e.g. PT, which also uses "evsel->core.system_wide". Thanks Jin Yao
Re: [PATCH v2] drm/exynos: Remove dev_err() on platform_get_irq() failure
Hi Tamseel, Same patch[1] has been merged. So could you re-post this patch after rebasing it on top of exynos-drm-next branch? After rebase, only g2d part would be valid. Thanks, Inki Dae [1] https://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git/commit/?h=exynos-drm-next=fdd79b0db1899f915f489e744a06846284fa3f1e 20. 5. 19. 오후 7:49에 Tamseel Shams 이(가) 쓴 글: > platform_get_irq() will call dev_err() itself on failure, > so there is no need for the driver to also do this. > This is detected by coccinelle. > > Also removing unnecessary curly braces around if () statement. > > Signed-off-by: Tamseel Shams > --- > Fixed review comment by j...@perches.com > > drivers/gpu/drm/exynos/exynos_drm_dsi.c | 4 +--- > drivers/gpu/drm/exynos/exynos_drm_g2d.c | 1 - > drivers/gpu/drm/exynos/exynos_drm_rotator.c | 4 +--- > drivers/gpu/drm/exynos/exynos_drm_scaler.c | 4 +--- > 4 files changed, 3 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c > b/drivers/gpu/drm/exynos/exynos_drm_dsi.c > index 902938d2568f..958e2c6a6702 100644 > --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c > +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c > @@ -1809,10 +1809,8 @@ static int exynos_dsi_probe(struct platform_device > *pdev) > } > > dsi->irq = platform_get_irq(pdev, 0); > - if (dsi->irq < 0) { > - dev_err(dev, "failed to request dsi irq resource\n"); > + if (dsi->irq < 0) > return dsi->irq; > - } > > irq_set_status_flags(dsi->irq, IRQ_NOAUTOEN); > ret = devm_request_threaded_irq(dev, dsi->irq, NULL, > diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c > b/drivers/gpu/drm/exynos/exynos_drm_g2d.c > index fcee33a43aca..03be31427181 100644 > --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c > +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c > @@ -1498,7 +1498,6 @@ static int g2d_probe(struct platform_device *pdev) > > g2d->irq = platform_get_irq(pdev, 0); > if (g2d->irq < 0) { > - dev_err(dev, "failed to get irq\n"); > ret = g2d->irq; > goto err_put_clk; > } > 
diff --git a/drivers/gpu/drm/exynos/exynos_drm_rotator.c > b/drivers/gpu/drm/exynos/exynos_drm_rotator.c > index dafa87b82052..2d94afba031e 100644 > --- a/drivers/gpu/drm/exynos/exynos_drm_rotator.c > +++ b/drivers/gpu/drm/exynos/exynos_drm_rotator.c > @@ -293,10 +293,8 @@ static int rotator_probe(struct platform_device *pdev) > return PTR_ERR(rot->regs); > > irq = platform_get_irq(pdev, 0); > - if (irq < 0) { > - dev_err(dev, "failed to get irq\n"); > + if (irq < 0) > return irq; > - } > > ret = devm_request_irq(dev, irq, rotator_irq_handler, 0, dev_name(dev), > rot); > diff --git a/drivers/gpu/drm/exynos/exynos_drm_scaler.c > b/drivers/gpu/drm/exynos/exynos_drm_scaler.c > index 93c43c8d914e..ce1857138f89 100644 > --- a/drivers/gpu/drm/exynos/exynos_drm_scaler.c > +++ b/drivers/gpu/drm/exynos/exynos_drm_scaler.c > @@ -502,10 +502,8 @@ static int scaler_probe(struct platform_device *pdev) > return PTR_ERR(scaler->regs); > > irq = platform_get_irq(pdev, 0); > - if (irq < 0) { > - dev_err(dev, "failed to get irq\n"); > + if (irq < 0) > return irq; > - } > > ret = devm_request_threaded_irq(dev, irq, NULL, scaler_irq_handler, > IRQF_ONESHOT, "drm_scaler", scaler); >
Re: [RFC PATCH 0/8] Qualcomm Cloud AI 100 driver
On Tue, May 19, 2020 at 12:26:01PM -0600, Jeffrey Hugo wrote: > On 5/19/2020 12:12 PM, Greg Kroah-Hartman wrote: > > > > Especially given the copyright owner of this code, that would be just > > > > crazy and foolish to not have open userspace code as well. Firmware > > > > would also be wonderful as well, go poke your lawyers about derivative > > > > work issues and the like for fun conversations :) > > > > > > Those are the kind of conversations I try to avoid :) > > > > Sounds like you are going to now have to have them, have fun! > > Honestly, I fail to see where you think there is a derivative work, so, I'm > not really sure what discussions I need to revisit with our lawyers. Given that we are not lawyers, why don't we leave those types of discussions up to the lawyers, and not depend on people like me and you for that? :) If your lawyers think that the code division is fine as-is, that's great, I'd be glad to review it if they add their signed-off-by: on it verifying that the api divide is approved by them. thanks! greg k-h
[PATCH v6 11/12] mmap locking API: convert mmap_sem API comments
Convert comments that reference old mmap_sem APIs to reference corresponding new mmap locking APIs instead. Signed-off-by: Michel Lespinasse --- Documentation/vm/hmm.rst | 6 +++--- arch/alpha/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 2 +- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/riscv/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/xtensa/mm/fault.c | 2 +- drivers/android/binder_alloc.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/userfaultfd.c | 2 +- mm/filemap.c | 2 +- mm/gup.c | 12 ++-- mm/huge_memory.c | 4 ++-- mm/khugepaged.c| 2 +- mm/ksm.c | 2 +- mm/memory.c| 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 4 ++-- mm/mmap.c | 2 +- mm/oom_kill.c | 8 net/ipv4/tcp.c | 2 +- 29 files changed, 43 insertions(+), 43 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 4e3e9362afeb..046817505033 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -194,15 +194,15 @@ The usage pattern is:: again: range.notifier_seq = mmu_interval_read_begin(_sub); - down_read(>mmap_sem); + mmap_read_lock(mm); ret = hmm_range_fault(); if (ret) { - up_read(>mmap_sem); + mmap_read_unlock(mm); if (ret == -EBUSY) goto again; return ret; } - up_read(>mmap_sem); + mmap_read_unlock(mm); take_lock(driver->update); if (mmu_interval_read_retry(, range.notifier_seq) { diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 36efa778ee1a..c2303a8c2b9f 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -171,7 +171,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -/* No need to up_read(>mmap_sem) as we would +/* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. 
*/ diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 9b95050c2048..0f788992608a 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -169,7 +169,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -/* No need to up_read(>mmap_sem) as we would +/* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 650acab0d77d..a94a814ad6ad 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -165,7 +165,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, flags |= FAULT_FLAG_TRIED; /* -* No need to up_read(>mmap_sem) as we would +* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 9d7c423dea1d..ebf1ac50b291 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -239,7 +239,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, flags |= FAULT_FLAG_TRIED; /* -* No need to up_read(>mmap_sem) as we would +* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 9ef2dd39111e..01b168a90434 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -181,7 +181,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, flags |= FAULT_FLAG_TRIED; /* -* No need to up_read(>mmap_sem) as we would +
[PATCH v6 09/12] mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()
Add new APIs to assert that mmap_sem is held. Using this instead of rwsem_is_locked and lockdep_assert_held[_write] makes the assertions more tolerant of future changes to the lock type. Signed-off-by: Michel Lespinasse --- arch/x86/events/core.c| 2 +- fs/userfaultfd.c | 6 +++--- include/linux/mmap_lock.h | 14 ++ mm/gup.c | 2 +- mm/hmm.c | 2 +- mm/memory.c | 2 +- mm/mmu_notifier.c | 6 +++--- mm/pagewalk.c | 6 +++--- mm/util.c | 2 +- 9 files changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..66559ac4f89e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2182,7 +2182,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_write(>mmap_sem); + mmap_assert_write_locked(mm); if (atomic_inc_return(>context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9c645eee1a59..12b492409040 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -234,7 +234,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, pte_t *ptep, pte; bool ret = true; - VM_BUG_ON(!rwsem_is_locked(>mmap_sem)); + mmap_assert_locked(mm); ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); @@ -286,7 +286,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t *pte; bool ret = true; - VM_BUG_ON(!rwsem_is_locked(>mmap_sem)); + mmap_assert_locked(mm); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -405,7 +405,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * Coredumping runs without mmap_sem so we can only check that * the mmap_sem is held, if PF_DUMPCORE was not set. 
*/ - WARN_ON_ONCE(!rwsem_is_locked(>mmap_sem)); + mmap_assert_locked(mm); ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index acac1bf5ecd2..43ef914e6468 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,6 +1,8 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +#include + #define MMAP_LOCK_INITIALIZER(name) \ .mmap_sem = __RWSEM_INITIALIZER((name).mmap_sem), @@ -73,4 +75,16 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) up_read_non_owner(>mmap_sem); } +static inline void mmap_assert_locked(struct mm_struct *mm) +{ + lockdep_assert_held(>mmap_sem); + VM_BUG_ON_MM(!rwsem_is_locked(>mmap_sem), mm); +} + +static inline void mmap_assert_write_locked(struct mm_struct *mm) +{ + lockdep_assert_held_write(>mmap_sem); + VM_BUG_ON_MM(!rwsem_is_locked(>mmap_sem), mm); +} + #endif /* _LINUX_MMAP_LOCK_H */ diff --git a/mm/gup.c b/mm/gup.c index 631285295950..c1c0b37d0e8f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1405,7 +1405,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(>mmap_sem), mm); + mmap_assert_locked(mm); gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) diff --git a/mm/hmm.c b/mm/hmm.c index 280585833adf..660a4bcf932a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -581,7 +581,7 @@ long hmm_range_fault(struct hmm_range *range) struct mm_struct *mm = range->notifier->mm; int ret; - lockdep_assert_held(>mmap_sem); + mmap_assert_locked(mm); do { /* If range is no longer valid force retry. 
*/ diff --git a/mm/memory.c b/mm/memory.c index e6dd3309c5a3..20f98ea8968e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1214,7 +1214,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { - VM_BUG_ON_VMA(!rwsem_is_locked(>mm->mmap_sem), vma); + mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index cfd0a03bf5cc..24eb9d1ed0a7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -609,7
[PATCH v6 02/12] MMU notifier: use the new mmap locking API
This use is converted manually ahead of the next patch in the series, as it requires including a new header which the automated conversion would miss. Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Davidlohr Bueso Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- include/linux/mmu_notifier.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 736f6918335e..2f462710a1a4 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -277,9 +278,9 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; - down_write(>mmap_sem); + mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); - up_write(>mmap_sem); + mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 10/12] mmap locking API: rename mmap_sem to mmap_lock
Rename the mmap_sem field to mmap_lock. Any new uses of this lock should now go through the new mmap locking api. The mmap_lock is still implemented as a rwsem, though this could change in the future. Signed-off-by: Michel Lespinasse Reviewed-by: Vlastimil Babka --- arch/ia64/mm/fault.c | 4 +-- arch/x86/mm/fault.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- include/linux/mm_types.h | 2 +- include/linux/mmap_lock.h | 38 +-- mm/memory.c | 2 +- mm/mmap.c | 4 +-- mm/mmu_notifier.c | 2 +- 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 693f00b117e1..9b95050c2048 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -70,8 +70,8 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re mask = isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); - /* mmap_sem is performance critical */ - prefetchw(>mmap_sem); + /* mmap_lock is performance critical */ + prefetchw(>mmap_lock); /* * If we're in an interrupt or have no user context, we must not take the fault.. 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 181f66b9049f..35f530f9dfc0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1522,7 +1522,7 @@ dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - prefetchw(>mm->mmap_sem); + prefetchw(>mm->mmap_lock); trace_page_fault_entries(regs, hw_error_code, address); if (unlikely(kmmio_fault(regs, address))) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index dc9ef302f517..701f3995f621 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -661,7 +661,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) struct etnaviv_gem_userptr *userptr = _obj->userptr; int ret, pinned = 0, npages = etnaviv_obj->base.size >> PAGE_SHIFT; - might_lock_read(>mm->mmap_sem); + might_lock_read(>mm->mmap_lock); if (userptr->mm != current->mm) return -EPERM; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4aba6c0c2ba8..d13b90399c16 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -436,7 +436,7 @@ struct mm_struct { spinlock_t page_table_lock; /* Protects page tables and some * counters */ - struct rw_semaphore mmap_sem; + struct rw_semaphore mmap_lock; struct list_head mmlist; /* List of maybe swapped mm's. 
These * are globally strung together off diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 43ef914e6468..b5bd86778cca 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -4,67 +4,67 @@ #include #define MMAP_LOCK_INITIALIZER(name) \ - .mmap_sem = __RWSEM_INITIALIZER((name).mmap_sem), + .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), static inline void mmap_init_lock(struct mm_struct *mm) { - init_rwsem(>mmap_sem); + init_rwsem(>mmap_lock); } static inline void mmap_write_lock(struct mm_struct *mm) { - down_write(>mmap_sem); + down_write(>mmap_lock); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { - down_write_nested(>mmap_sem, subclass); + down_write_nested(>mmap_lock, subclass); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { - return down_write_killable(>mmap_sem); + return down_write_killable(>mmap_lock); } static inline bool mmap_write_trylock(struct mm_struct *mm) { - return down_write_trylock(>mmap_sem) != 0; + return down_write_trylock(>mmap_lock) != 0; } static inline void mmap_write_unlock(struct mm_struct *mm) { - up_write(>mmap_sem); + up_write(>mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { - downgrade_write(>mmap_sem); + downgrade_write(>mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { - down_read(>mmap_sem); + down_read(>mmap_lock); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { - return down_read_killable(>mmap_sem); + return down_read_killable(>mmap_lock); } static inline bool mmap_read_trylock(struct mm_struct *mm) { - return down_read_trylock(>mmap_sem) != 0; + return down_read_trylock(>mmap_lock) != 0; } static
[PATCH v6 08/12] mmap locking API: add MMAP_LOCK_INITIALIZER
Define a new initializer for the mmap locking api. Initially this just evaluates to __RWSEM_INITIALIZER as the API is defined as wrappers around rwsem. Signed-off-by: Michel Lespinasse Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- arch/x86/kernel/tboot.c| 2 +- drivers/firmware/efi/efi.c | 2 +- include/linux/mmap_lock.h | 3 +++ mm/init-mm.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b89f6ac6a0c0..885058325c20 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -90,7 +90,7 @@ static struct mm_struct tboot_mm = { .pgd= swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), }; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 911a2bd0f6b7..916313ec8acb 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -54,7 +54,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(efi_mm) .page_table_lock= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d1826ce42f00..acac1bf5ecd2 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,6 +1,9 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +#define MMAP_LOCK_INITIALIZER(name) \ + .mmap_sem = __RWSEM_INITIALIZER((name).mmap_sem), + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(>mmap_sem); diff --git a/mm/init-mm.c b/mm/init-mm.c index 19603302a77f..fe9c03d8e07b 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,7 +31,7 @@ struct 
mm_struct init_mm = { .pgd= swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 07/12] mmap locking API: add mmap_read_trylock_non_owner()
Add a couple APIs used by kernel/bpf/stackmap.c only: - mmap_read_trylock_non_owner() - mmap_read_unlock_non_owner() (may be called from a work queue). It's still not ideal that bpf/stackmap subverts the lock ownership in this way. Thanks to Peter Zijlstra for suggesting this API as the least-ugly way of addressing this in the short term. Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Vlastimil Babka --- include/linux/mmap_lock.h | 14 ++ kernel/bpf/stackmap.c | 17 + 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a757cb30ae77..d1826ce42f00 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -56,4 +56,18 @@ static inline void mmap_read_unlock(struct mm_struct *mm) up_read(>mmap_sem); } +static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) +{ + if (down_read_trylock(>mmap_sem)) { + rwsem_release(>mmap_sem.dep_map, _RET_IP_); + return true; + } + return false; +} + +static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) +{ + up_read_non_owner(>mmap_sem); +} + #endif /* _LINUX_MMAP_LOCK_H */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 11d41f0c7005..998968659892 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -33,7 +33,7 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) @@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry) return; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. 
*/ if (!user || !current || !current->mm || irq_work_busy || - mmap_read_trylock(current->mm) == 0) { + !mmap_read_trylock_non_owner(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock(current->mm); + mmap_read_unlock_non_owner(current->mm); } else { - work->sem = >mm->mmap_sem; + work->mm = current->mm; irq_work_queue(>irq_work); - /* -* The irq_work will release the mmap_sem with -* up_read_non_owner(). The rwsem_release() is called -* here to release the lock from lockdep's perspective. -*/ - rwsem_release(>mm->mmap_sem.dep_map, _RET_IP_); } } -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 05/12] mmap locking API: convert mmap_sem call sites missed by coccinelle
Convert the last few remaining mmap_sem rwsem calls to use the new mmap locking API. These were missed by coccinelle for some reason (I think coccinelle does not support some of the preprocessor constructs in these files ?) Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- arch/mips/mm/fault.c | 10 +- arch/riscv/mm/pageattr.c | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 8 fs/proc/base.c | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index f8d62cd83b36..9ef2dd39111e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -97,7 +97,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, if (user_mode(regs)) flags |= FAULT_FLAG_USER; retry: - down_read(>mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); if (!vma) goto bad_area; @@ -190,7 +190,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, } } - up_read(>mmap_sem); + mmap_read_unlock(mm); return; /* @@ -198,7 +198,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * Fix it, but check if it's kernel or user first.. */ bad_area: - up_read(>mmap_sem); + mmap_read_unlock(mm); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ @@ -250,14 +250,14 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ - up_read(>mmap_sem); + mmap_read_unlock(mm); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: - up_read(>mmap_sem); + mmap_read_unlock(mm); /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 728759eb530a..b9072c043222 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -117,10 +117,10 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, if (!numpages) return 0; - down_read(_mm.mmap_sem); + mmap_read_lock(_mm); ret = walk_page_range_novma(_mm, start, end, _ops, NULL, ); - up_read(_mm.mmap_sem); + mmap_read_unlock(_mm); flush_tlb_kernel_range(start, end); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 9bdf9b7d9a96..40e5bb67cc09 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -165,22 +165,22 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long pfn; unsigned long paddr; - down_read(>mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - up_read(>mm->mmap_sem); + mmap_read_unlock(current->mm); return -EFAULT; } pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; paddr = pfn << PAGE_SHIFT; table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); if (!table) { - up_read(>mm->mmap_sem); + mmap_read_unlock(current->mm); return -EFAULT; } ret = CMPXCHG([index], orig_pte, new_pte); memunmap(table); - up_read(>mm->mmap_sem); + mmap_read_unlock(current->mm); } return (ret != orig_pte); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9a68032d8d73..a96377557db7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2314,7 +2314,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!mm) goto out_put_task; - ret = down_read_killable(>mmap_sem); + ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; @@ -2341,7 +2341,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p = genradix_ptr_alloc(, nr_files++, GFP_KERNEL); if (!p) { 
ret = -ENOMEM; - up_read(>mmap_sem); + mmap_read_unlock(mm); mmput(mm); goto out_put_task; } @@ -2350,7 +2350,7 @@
[PATCH v6 00/12] Add a new mmap locking API wrapping mmap_sem calls
Reposting this patch series on top of v5.7-rc6. I think this is ready for inclusion into the -mm tree; however there were some minor points of feedback to address and also it was easier to regenerate a full version after the v5.5 (only updating patches 09/10 and 10/10) caused some confusion. This patch series adds a new mmap locking API replacing the existing mmap_sem lock and unlocks. Initially the API is just implemente in terms of inlined rwsem calls, so it doesn't provide any new functionality. There are two justifications for the new API: - At first, it provides an easy hooking point to instrument mmap_sem locking latencies independently of any other rwsems. - In the future, it may be a starting point for replacing the rwsem implementation with a different one, such as range locks. This is something that is being explored, even though there is no wide concensus about this possible direction yet. (see https://patchwork.kernel.org/cover/11401483/) Changes since v5.5 of the patchset: - Applied the changes on top of v5.7-rc6. This was a straight rebase except for the changes noted here. - Re-generated the coccinelle changes (patch 04/12). - Patch 08/12: use (name) in the MMAP_LOCK_INITIALIZER macro. - Patch 09/12: use lockdep_assert_held() / lockdep_assert_held_write() so that mmap_assert_locked() and mmap_assert_write_locked() get better coverage when lockdep is enabled but CONFIG_DEBUG_VM is not. - Added patches 11 and 12, converting comments that referenced mmap_sem rwsem calls or the mmap_sem lock itself, to reference the corresponding mmap locking APIs or the mmap_lock itself. Changes since v5 of the patchset: - Patch 09/10: Add both mmap_assert_locked() and mmap_assert_write_locked(); convert some call sites that were using lockdep assertions to use these new APIs instead. Changes since v4 of the patchset: - Applied the changes on top of v5.7-rc2. This was a straight rebase except for changes noted here. 
- Patch 01/10: renamed the mmap_write_downgrade API (as suggested by Davidlohr Bueso). - Patch 05/10: added arch/riscv/mm/pageattr.c changes that had been previously missed, as found by the kbuild bot. - Patch 06/10: use SINGLE_DEPTH_NESTING as suggested by Matthew Wilcox. - Patch 08/10: change MMAP_LOCK_INITIALIZER definition as suggested by Matthew Wilcox. - Patch 09/10: add mm_assert_locked API as suggested by Matthew Wilcox. Changes since v3 of the patchset: - The changes now apply on top of v5.7-rc1. This was a straight rebase except for changes noted here. - Re-generated the coccinelle changes (patch 04/10). - Patch 06/10: removed the mmap_write_unlock_nested API; mmap_write_lock_nested() calls now pair with the regular mmap_write_unlock() as was suggested by many people. - Patch 07/10: removed the mmap_read_release API; this is replaced with mmap_read_trylock_non_owner() which pairs with mmap_read_unlock_non_owner() Thanks to Peter Zijlstra for the suggestion. Changes since v2 of the patchset: - Removed the mmap_is_locked API - v2 had removed all uses of it, but the actual function definition was still there unused. Thanks to Jason Gunthorpe for noticing the unused mmap_is_locked function. Changes since v1 of the patchset: - Manually convert drivers/dma-buf/dma-resv.c ahead of the automated coccinelle conversion as this file requires a new include statement. Thanks to Intel's kbuild test bot for finding the issue. - In coccinelle automated conversion, apply a single coccinelle rule as suggested by Markus Elfring. - In manual conversion of sites missed by coccinelle, fix an issue where I had used mm_read_unlock (from an older version of my patchset) instead of mmap_read_unlock in some arch/mips code. This was also identified by Intel's kbuild test bot. - Do not add a new mmap_is_locked API, and use lockdep_assert_held instead. Thanks to Jason Gunthorpe and Matthew Wilcox for the suggestion. The changes apply on top of v5.7-rc6. 
I think these changes are ready for integration into the -mm tree now (for integration into v5.8). The coccinelle part of the change is relatively invasive, but can be skipped over on a file by file basis if it causes any conflicts with other pending changes. The new mmap locking API can interoperate with new code that is still using direct rwsem calls, until the last patch in the series which renames mmap_sem to enforce using the new API. Maybe that last patch could be delayed for a bit, so that we'd get a chance to convert any new code that locks mmap_sem in the -rc1 release before applying that last patch. Michel Lespinasse (12): mmap locking API: initial implementation as rwsem wrappers MMU notifier: use the new mmap locking API DMA reservations: use the new mmap locking API mmap locking API: use coccinelle to convert mmap_sem rwsem call sites mmap locking API: convert mmap_sem call sites missed by coccinelle mmap locking API: convert nested write
[PATCH v6 06/12] mmap locking API: convert nested write lock sites
Add API for nested write locks and convert the few call sites doing that. Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- arch/um/include/asm/mmu_context.h | 3 ++- include/linux/mmap_lock.h | 5 + kernel/fork.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 62262c5c7785..17ddd4edf875 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -8,6 +8,7 @@ #include #include +#include #include @@ -47,7 +48,7 @@ static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) * when the new ->mm is used for the first time. */ __switch_mm(&new->context.id); - down_write_nested(&new->mmap_sem, 1); + mmap_write_lock_nested(new, SINGLE_DEPTH_NESTING); uml_setup_stubs(new); mmap_write_unlock(new); } diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 97ac53b66052..a757cb30ae77 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -11,6 +11,11 @@ static inline void mmap_write_lock(struct mm_struct *mm) down_write(&mm->mmap_sem); } +static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) +{ + down_write_nested(&mm->mmap_sem, subclass); +} + static inline int mmap_write_lock_killable(struct mm_struct *mm) { return down_write_killable(&mm->mmap_sem); diff --git a/kernel/fork.c b/kernel/fork.c index a4db6bc952c6..e702e84897fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -499,7 +499,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 03/12] DMA reservations: use the new mmap locking API
This use is converted manually ahead of the next patch in the series, as it requires including a new header which the automated conversion would miss. Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- drivers/dma-buf/dma-resv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index 4264e64788c4..b45f8514dc82 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -34,6 +34,7 @@ #include #include +#include #include /** @@ -109,7 +110,7 @@ static int __init dma_resv_lockdep(void) dma_resv_init(); - down_read(>mmap_sem); + mmap_read_lock(mm); ww_acquire_init(, _ww_class); ret = dma_resv_lock(, ); if (ret == -EDEADLK) @@ -118,7 +119,7 @@ static int __init dma_resv_lockdep(void) fs_reclaim_release(GFP_KERNEL); ww_mutex_unlock(); ww_acquire_fini(); - up_read(>mmap_sem); + mmap_read_unlock(mm); mmput(mm); -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 12/12] mmap locking API: convert mmap_sem comments
Convert comments that reference mmap_sem to reference mmap_lock instead. Signed-off-by: Michel Lespinasse --- .../admin-guide/mm/numa_memory_policy.rst | 10 ++--- Documentation/admin-guide/mm/userfaultfd.rst | 2 +- Documentation/filesystems/locking.rst | 2 +- Documentation/vm/transhuge.rst| 4 +- arch/arc/mm/fault.c | 2 +- arch/arm/kernel/vdso.c| 2 +- arch/arm/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/microblaze/mm/fault.c| 2 +- arch/nds32/mm/fault.c | 2 +- arch/powerpc/include/asm/pkeys.h | 2 +- arch/powerpc/kvm/book3s_hv_uvmem.c| 6 +-- arch/powerpc/mm/book3s32/tlb.c| 2 +- arch/powerpc/mm/book3s64/hash_pgtable.c | 4 +- arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/powerpc/mm/fault.c | 8 ++-- arch/powerpc/mm/pgtable.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 6 +-- arch/riscv/mm/fault.c | 2 +- arch/s390/kvm/priv.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/s390/mm/gmap.c | 32 +++ arch/s390/mm/pgalloc.c| 2 +- arch/sh/mm/cache-sh4.c| 2 +- arch/sh/mm/fault.c| 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/um/kernel/tlb.c | 2 +- arch/unicore32/mm/fault.c | 2 +- arch/x86/events/core.c| 2 +- arch/x86/include/asm/mmu.h| 2 +- arch/x86/include/asm/pgtable-3level.h | 8 ++-- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c| 6 +-- arch/x86/kernel/ldt.c | 2 +- arch/x86/mm/fault.c | 12 +++--- drivers/char/mspec.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 6 +-- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 +-- drivers/infiniband/core/uverbs_main.c | 2 +- drivers/infiniband/hw/hfi1/mmu_rb.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/misc/cxl/cxllib.c | 2 +- drivers/misc/sgi-gru/grufault.c | 8 ++-- drivers/oprofile/buffer_sync.c| 2 +- drivers/staging/android/ashmem.c | 4 +- 
drivers/staging/comedi/comedi_fops.c | 2 +- drivers/tty/vt/consolemap.c | 2 +- drivers/xen/gntdev.c | 2 +- fs/coredump.c | 4 +- fs/exec.c | 2 +- fs/ext2/file.c| 2 +- fs/ext4/super.c | 6 +-- fs/kernfs/file.c | 4 +- fs/proc/base.c| 6 +-- fs/proc/task_mmu.c| 6 +-- fs/userfaultfd.c | 18 - fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c| 14 +++ fs/xfs/xfs_iops.c | 4 +- include/asm-generic/pgtable.h | 6 +-- include/linux/fs.h| 4 +- include/linux/huge_mm.h | 2 +- include/linux/mempolicy.h | 2 +- include/linux/mm.h| 10 ++--- include/linux/mm_types.h | 2 +- include/linux/mmu_notifier.h | 8 ++-- include/linux/pagemap.h | 2 +- include/linux/rmap.h | 2 +- include/linux/sched/mm.h | 10 ++--- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c| 4 +- kernel/events/core.c | 6 +-- kernel/events/uprobes.c | 4 +- kernel/exit.c | 2 +- kernel/relay.c| 2 +- kernel/sys.c | 4 +- lib/test_lockup.c | 8 ++-- mm/filemap.c | 38 +- mm/frame_vector.c | 2 +- mm/gup.c | 38 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +-
[PATCH v6 01/12] mmap locking API: initial implementation as rwsem wrappers
This change wraps the existing mmap_sem related rwsem calls into a new mmap locking API. There are two justifications for the new API: - At first, it provides an easy hooking point to instrument mmap_sem locking latencies independently of any other rwsems. - In the future, it may be a starting point for replacing the rwsem implementation with a different one, such as range locks. Signed-off-by: Michel Lespinasse Reviewed-by: Daniel Jordan Reviewed-by: Davidlohr Bueso Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka --- include/linux/mm.h| 1 + include/linux/mmap_lock.h | 54 +++ 2 files changed, 55 insertions(+) create mode 100644 include/linux/mmap_lock.h diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..051ec782bdbb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h new file mode 100644 index ..97ac53b66052 --- /dev/null +++ b/include/linux/mmap_lock.h @@ -0,0 +1,54 @@ +#ifndef _LINUX_MMAP_LOCK_H +#define _LINUX_MMAP_LOCK_H + +static inline void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_sem); +} + +static inline void mmap_write_lock(struct mm_struct *mm) +{ + down_write(&mm->mmap_sem); +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + return down_write_killable(&mm->mmap_sem); +} + +static inline bool mmap_write_trylock(struct mm_struct *mm) +{ + return down_write_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ + up_write(&mm->mmap_sem); +} + +static inline void mmap_write_downgrade(struct mm_struct *mm) +{ + downgrade_write(&mm->mmap_sem); +} + +static inline void mmap_read_lock(struct mm_struct *mm) +{ + down_read(&mm->mmap_sem); +} + +static inline int mmap_read_lock_killable(struct mm_struct *mm) +{ + return down_read_killable(&mm->mmap_sem); +} + +static inline bool mmap_read_trylock(struct mm_struct *mm) +{ + 
return down_read_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ + up_read(&mm->mmap_sem); +} + +#endif /* _LINUX_MMAP_LOCK_H */ -- 2.26.2.761.g0e0b3e54be-goog
Re: [PATCH] s390/sclp_vt220: Fix console name to match device
On 19.05.20 20:16, Valentin Vidic wrote: > Console name reported in /proc/consoles: > > ttyS1-W- (EC p )4:65 > > does not match device name: > > crw--w1 root root4, 65 May 17 12:18 /dev/ttysclp0 > > so debian-installer gets confused and fails to start. > > Signed-off-by: Valentin Vidic > Cc: sta...@vger.kernel.org This is not as simple. ttyS1 is the console name and ttysclp0 is the tty name. This has mostly historic reasons and it obviously causes problems. But there is documentation out there that actually describes the use of console=ttyS1 console=ttyS0. to have console output on both sclp consoles and there are probably scripts using ttyS1. I am wondering. The tty for ttyS0 is named sclp_line0. Does this work in LPAR? > --- > drivers/s390/char/sclp_vt220.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c > index 3f9a6ef650fa..3c2ed6d01387 100644 > --- a/drivers/s390/char/sclp_vt220.c > +++ b/drivers/s390/char/sclp_vt220.c > @@ -35,8 +35,8 @@ > #define SCLP_VT220_MINOR 65 > #define SCLP_VT220_DRIVER_NAME "sclp_vt220" > #define SCLP_VT220_DEVICE_NAME "ttysclp" > -#define SCLP_VT220_CONSOLE_NAME "ttyS" > -#define SCLP_VT220_CONSOLE_INDEX 1 /* console=ttyS1 */ > +#define SCLP_VT220_CONSOLE_NAME "ttysclp" > +#define SCLP_VT220_CONSOLE_INDEX 0 /* console=ttysclp0 */ > > /* Representation of a single write request */ > struct sclp_vt220_request { >
Re: [PATCH v4 00/15] virtio-mem: paravirtualized memory
Hi David, Thanks for your work. I tried this version with cloud-hypervisor master. It worked very well. Best, Hui > 2020年5月7日 22:01,David Hildenbrand 写道: > > This series is based on v5.7-rc4. The patches are located at: >https://github.com/davidhildenbrand/linux.git virtio-mem-v4 > > This is basically a resend of v3 [1], now based on v5.7-rc4 and restested. > One patch was reshuffled and two ACKs I missed to add were added. The > rebase did not require any modifications to patches. > > Details about virtio-mem can be found in the cover letter of v2 [2]. A > basic QEMU implementation was posted yesterday [3]. > > [1] https://lkml.kernel.org/r/20200507103119.11219-1-da...@redhat.com > [2] https://lkml.kernel.org/r/20200311171422.10484-1-da...@redhat.com > [3] https://lkml.kernel.org/r/20200506094948.76388-1-da...@redhat.com > > v3 -> v4: > - Move "MAINTAINERS: Add myself as virtio-mem maintainer" to #2 > - Add two ACKs from Andrew (in reply to v2) > -- "mm: Allow to offline unmovable PageOffline() pages via ..." 
> -- "mm/memory_hotplug: Introduce offline_and_remove_memory()" > > v2 -> v3: > - "virtio-mem: Paravirtualized memory hotplug" > -- Include "linux/slab.h" to fix build issues > -- Remember the "region_size", helpful for patch #11 > -- Minor simplifaction in virtio_mem_overlaps_range() > -- Use notifier_from_errno() instead of notifier_to_errno() in notifier > -- More reliable check for added memory when unloading the driver > - "virtio-mem: Allow to specify an ACPI PXM as nid" > -- Also print the nid > - Added patch #11-#15 > > David Hildenbrand (15): > virtio-mem: Paravirtualized memory hotplug > MAINTAINERS: Add myself as virtio-mem maintainer > virtio-mem: Allow to specify an ACPI PXM as nid > virtio-mem: Paravirtualized memory hotunplug part 1 > virtio-mem: Paravirtualized memory hotunplug part 2 > mm: Allow to offline unmovable PageOffline() pages via >MEM_GOING_OFFLINE > virtio-mem: Allow to offline partially unplugged memory blocks > mm/memory_hotplug: Introduce offline_and_remove_memory() > virtio-mem: Offline and remove completely unplugged memory blocks > virtio-mem: Better retry handling > virtio-mem: Add parent resource for all added "System RAM" > virtio-mem: Drop manual check for already present memory > virtio-mem: Unplug subblocks right-to-left > virtio-mem: Use -ETXTBSY as error code if the device is busy > virtio-mem: Try to unplug the complete online memory block first > > MAINTAINERS |7 + > drivers/acpi/numa/srat.c|1 + > drivers/virtio/Kconfig | 17 + > drivers/virtio/Makefile |1 + > drivers/virtio/virtio_mem.c | 1962 +++ > include/linux/memory_hotplug.h |1 + > include/linux/page-flags.h | 10 + > include/uapi/linux/virtio_ids.h |1 + > include/uapi/linux/virtio_mem.h | 208 > mm/memory_hotplug.c | 81 +- > mm/page_alloc.c | 26 + > mm/page_isolation.c |9 + > 12 files changed, 2314 insertions(+), 10 deletions(-) > create mode 100644 drivers/virtio/virtio_mem.c > create mode 100644 include/uapi/linux/virtio_mem.h > > -- > 2.25.3
Re: [PATCH v12 10/10] KVM: x86: Enable CET virtualization and advertise CET to userspace
On Wed, May 06, 2020 at 04:21:09PM +0800, Yang Weijiang wrote: > Set the feature bits so that CET capabilities can be seen in guest via > CPUID enumeration. Add CR4.CET bit support in order to allow guest set CET > master control bit(CR4.CET). > > Signed-off-by: Yang Weijiang > --- > arch/x86/include/asm/kvm_host.h | 3 ++- > arch/x86/kvm/cpuid.c| 5 +++-- > 2 files changed, 5 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index f68c825e94ad..21f3c89d8c70 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -95,7 +95,8 @@ > | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | > X86_CR4_PCIDE \ > | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ > | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ > - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) > + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ > + | X86_CR4_CET)) > > #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index 984ab2b395b3..333a9e0d7cdf 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -344,7 +344,8 @@ void kvm_set_cpu_caps(void) > F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | > F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | > F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | > - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ > + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | > + F(SHSTK) > ); > /* Set LA57 based on hardware capability. 
*/ > if (cpuid_ecx(7) & F(LA57)) > @@ -353,7 +354,7 @@ void kvm_set_cpu_caps(void) > kvm_cpu_cap_mask(CPUID_7_EDX, > F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | > F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | > - F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) > + F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | F(IBT) SHSTK and IBT need to be disabled in vmx_set_cpu_caps() if unrestricted guest is disabled. CET won't play nice with emulating arbitrary instructions, e.g. KVM doesn't enforce ENDBR and doesn't keep SSP up-to-date (and no one is advocating fully emulating CET). Paolo also floated the idea of providing a reduced opcode set, e.g. only I/O, MOV, and ALU instructions, but I don't think that needs to be done in the initial CET enabling as it's more of a defense-in-depth than a functional requirement. No need to respin a new series just for this, it can wait until I've looked through this version. Original thread: https://lkml.kernel.org/r/20200515161919.29249-1-pbonz...@redhat.com > ); > > /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ > -- > 2.17.2 >
Re: [RFC V2] mm/vmstat: Add events for PMD based THP migration without split
On 2020-05-19 20:32, Anshuman Khandual wrote: ... How about not being quite so granular on the THP config options, and just guarding these events with the overall CONFIG_TRANSPARENT_HUGEPAGE option, instead of the sub-option CONFIG_ARCH_ENABLE_THP_MIGRATION? I tentatively think it's harmless and not really misleading to have /proc/vmstat showing this in all THP-enabled configurations: thp_pmd_migration_success 0 thp_pmd_migration_failure 0 ...if THP is enabled, and *whether or not* _THP_MIGRATION is enabled. And this simplifies things a bit. Given how the .config options can get, I think simplifying would be nice. However, I'm ready to be corrected on that, if it's a bad idea for other API reasons perhaps. Can anyone please comment? There is no THP migration events to track unless it is enabled. Why to show these statistics (as 0) when its not even possible. If the config simplicity is the only intended rationale here, it might not be the case either. These events and their tracking would still need to be wrapped with CONFIG_TRANSPARENT_HUGEPAGE otherwise. If your concern is more towards CONFIG_ARCH_ENABLE_THP_MIGRATION being unsuitable or with complex dependencies, then that is something how THP migration feature itself is implemented currently and adding VM events does not address that. A possible patch in the future patch could solve all these (together). But sure, let's hear it for what others have to say on this. Well, I don't want to hold up progress. If it's not very convincing to you, let's just drop the idea/ It was kind of weak. 
:) + THP_PMD_MIGRATION_SUCCESS, + THP_PMD_MIGRATION_FAILURE, +#endif #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..5325700a3e90 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1170,6 +1170,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, #define ICE_noinline #endif +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline void thp_migration_success(bool success) I think this should be named thp_pmd_migration_success() , since that's what you're really counting. Or, you could name the events THP_MIGRATION_SUCCESS|FAILURE. Either way, just so the function name matches the events it's counting. Makes sense but IMHO we should keep _pmd_ to be more specific. Will change the name here as thp_pmd_migration_success(). +{ + if (success) + count_vm_event(THP_PMD_MIGRATION_SUCCESS); + else + count_vm_event(THP_PMD_MIGRATION_FAILURE); +} +#else +static inline void thp_migration_success(bool success) { } This whole ifdef clause would disappear if my suggestion above is We will have to protect these with CONFIG_TRANSPARENT_HUGEPAGE as the events are still conditionally available. Yes you are right, of course. And I even worked through that, but then when I sat down to write a response my fingers typed v1 of my understanding instead of v2. No one knows why. :) Sorry about the misinformation there. accepted. However, if not, then I believe the convention for this kind of situation is: static inline void thp_migration_success(bool success) { } AFAIK, we have examples both ways but will change if this is preferred. Not worth worrying about, but I do recall a few recent code reviews that all preferred the multi-line version, which is why I suggested it. Anyway, either way, with the thp_pmd_migration_success() name change, you can add: Reviewed-by: John Hubbard thanks, -- John Hubbard NVIDIA
Re: seccomp feature development
On Wed, May 20, 2020 at 11:20:45AM +1000, Aleksa Sarai wrote: > > No it won't become copy_from_user(), nor will there be a TOCTOU race. > > The idea is that seccomp will proactively copy the struct (and > recursively any of the struct pointers inside) before the syscall runs > -- as this is done by seccomp it doesn't require any copy_from_user() > primitives in cBPF. We then run the cBPF filter on the copied struct, > just like how cBPF programs currently operate on seccomp_data (how this > would be exposed to the cBPF program as part of the seccomp ABI is the > topic of discussion here). > > Then, when the actual syscall code runs, the struct will have already > been copied and the syscall won't copy it again. Let's take bpf syscall as an example. Are you suggesting that all of syscall logic of conditionally parsing the arguments will be copy-pasted into seccomp-syscall infra, then it will do copy_from_user() all the data and replace all aligned_u64 in "union bpf_attr" with kernel copied pointers instead of user pointers and make all of bpf syscall's copy_from_user() actions to be conditional ? If seccomp is on, use kernel pointers... if seccomp is off, do copy_from_user ? And the same idea will be replicated for all syscalls?
Re: [PATCH v3 64/75] x86/sev-es: Cache CPUID results for improved performance
On Tue, Apr 28, 2020 at 05:17:14PM +0200, Joerg Roedel wrote: > From: Mike Stunes > > To avoid a future VMEXIT for a subsequent CPUID function, cache the > results returned by CPUID into an xarray. > > [tl: coding standard changes, register zero extension] > > Signed-off-by: Mike Stunes > Signed-off-by: Tom Lendacky > [ jroe...@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached() >- Used lower_32_bits() where applicable > - Moved cache_index out of struct es_em_ctxt ] > Co-developed-by: Joerg Roedel > Signed-off-by: Joerg Roedel > --- ... > +struct sev_es_cpuid_cache_entry { > + unsigned long eax; > + unsigned long ebx; > + unsigned long ecx; > + unsigned long edx; Why are these unsigned longs? CPUID returns 32-bit values, this wastes 16 bytes per entry. > +}; > + > +static struct xarray sev_es_cpuid_cache; > +static bool __ro_after_init sev_es_cpuid_cache_initialized; > + > /* For early boot hypervisor communication in SEV-ES enabled guests */ > static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); > > @@ -463,6 +474,9 @@ void __init sev_es_init_vc_handling(void) > sev_es_setup_vc_stack(cpu); > } > > + xa_init_flags(_es_cpuid_cache, XA_FLAGS_LOCK_IRQ); > + sev_es_cpuid_cache_initialized = true; > + > init_vc_stack_names(); > } > > @@ -744,6 +758,91 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, > return ret; > } > > +static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt) > +{ > + unsigned long hi, lo; > + > + /* Don't attempt to cache until the xarray is initialized */ > + if (!sev_es_cpuid_cache_initialized) > + return ULONG_MAX; > + > + lo = lower_32_bits(ctxt->regs->ax); > + > + /* > + * CPUID 0x000d requires both RCX and XCR0, so it can't be > + * cached. 
> + */ > + if (lo == 0x000d) > + return ULONG_MAX; > + > + /* > + * Some callers of CPUID don't always set RCX to zero for CPUID > + * functions that don't require RCX, which can result in excessive > + * cached values, so RCX needs to be manually zeroed for use as part > + * of the cache index. Future CPUID values may need RCX, but since > + * they can't be known, they must not be cached. > + */ > + if (lo > 0x8020) > + return ULONG_MAX; > + > + switch (lo) { > + case 0x0007: OSPKE may or may not be cached correctly depending on when sev_es_cpuid_cache_initialized is set. > + case 0x000b: > + case 0x000f: > + case 0x0010: > + case 0x801d: > + case 0x8020: > + hi = ctxt->regs->cx << 32; > + break; > + default: > + hi = 0; > + } > + > + return hi | lo; This needs to be way more restrictive on what is cached. Unless I've overlooked something, this lets userspace trigger arbitrary, unaccounted kernel memory allocations. E.g. for (i = 0; i <= 0x8020; i++) { for (j = 0; j <= 0x; j++) { cpuid(i, j); if (i != 7 || i != 0xb || i != 0xf || i != 0x10 || i != 0x801d || i != 0x8020) break; } } The whole cache on-demand approach seems like overkill. The number of CPUID leaves that are invoked after boot with any regularity can probably be counted on one hand. IIRC glibc invokes CPUID to gather TLB/cache info, XCR0-based features, and one or two other leafs. A statically sized global array that's arbitrarily index a la x86_capability would be just as simple and more performant. It would also allow fancier things like emulating CPUID 0xD in the guest if you want to go down that road.
Re: [RFC PATCH 0/8] Qualcomm Cloud AI 100 driver
On Tue, May 19, 2020 at 10:41:15PM +0200, Daniel Vetter wrote: > On Tue, May 19, 2020 at 07:41:20PM +0200, Greg Kroah-Hartman wrote: > > On Tue, May 19, 2020 at 08:57:38AM -0600, Jeffrey Hugo wrote: > > > On 5/18/2020 11:08 PM, Dave Airlie wrote: > > > > On Fri, 15 May 2020 at 00:12, Jeffrey Hugo wrote: > > > > > > > > > > Introduction: > > > > > Qualcomm Cloud AI 100 is a PCIe adapter card which contains a > > > > > dedicated > > > > > SoC ASIC for the purpose of efficently running Deep Learning inference > > > > > workloads in a data center environment. > > > > > > > > > > The offical press release can be found at - > > > > > https://www.qualcomm.com/news/releases/2019/04/09/qualcomm-brings-power-efficient-artificial-intelligence-inference > > > > > > > > > > The offical product website is - > > > > > https://www.qualcomm.com/products/datacenter-artificial-intelligence > > > > > > > > > > At the time of the offical press release, numerious technology news > > > > > sites > > > > > also covered the product. Doing a search of your favorite site is > > > > > likely > > > > > to find their coverage of it. > > > > > > > > > > It is our goal to have the kernel driver for the product fully > > > > > upstream. > > > > > The purpose of this RFC is to start that process. We are still doing > > > > > development (see below), and thus not quite looking to gain > > > > > acceptance quite > > > > > yet, but now that we have a working driver we beleive we are at the > > > > > stage > > > > > where meaningful conversation with the community can occur. > > > > > > > > > > > > Hi Jeffery, > > > > > > > > Just wondering what the userspace/testing plans for this driver. > > > > > > > > This introduces a new user facing API for a device without pointers to > > > > users or tests for that API. > > > > > > We have daily internal testing, although I don't expect you to take my > > > word > > > for that. 
> > > > > > I would like to get one of these devices into the hands of Linaro, so that > > > it can be put into KernelCI. Similar to other Qualcomm products. I'm > > > trying > > > to convince the powers that be to make this happen. > > > > > > Regarding what the community could do on its own, everything but the Linux > > > driver is considered proprietary - that includes the on device firmware > > > and > > > the entire userspace stack. This is a decision above my pay grade. > > > > Ok, that's a decision you are going to have to push upward on, as we > > really can't take this without a working, open, userspace. > > Uh wut. > > So the merge criteria for drivers/accel (atm still drivers/misc but I > thought that was interim until more drivers showed up) isn't actually > "totally-not-a-gpu accel driver without open source userspace". > > Instead it's "totally-not-a-gpu accel driver without open source > userspace" _and_ you have to be best buddies with Greg. Or at least > not be on the naughty company list. Since for habanalabs all you > wanted is a few test cases to exercise the ioctls. Not the entire > userspace. Also, to be fair, I have changed my mind after seeing the mess of complexity that these "ioctls for everyone!" type of pass-through these kinds of drivers are creating. You were right, we need open userspace code in order to be able to properly evaluate and figure out what they are doing is right or not and be able to maintain things over time correctly. So I was wrong, and you were right, my apologies for my previous stubbornness. thanks, greg k-h
Re: [PATCH v4 2/4] kasan: record and print the free track
> On Wed, May 20, 2020 at 6:03 AM Walter Wu wrote: > > > > > On Tue, May 19, 2020 at 4:25 AM Walter Wu > > > wrote: > > > > > > > > Move free track from slub alloc meta-data to slub free meta-data in > > > > order to make struct kasan_free_meta size is 16 bytes. It is a good > > > > size because it is the minimal redzone size and a good number of > > > > alignment. > > > > > > > > For free track in generic KASAN, we do the modification in struct > > > > kasan_alloc_meta and kasan_free_meta: > > > > - remove free track from kasan_alloc_meta. > > > > - add free track into kasan_free_meta. > > > > > > > > [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 > > > > > > > > Signed-off-by: Walter Wu > > > > Suggested-by: Dmitry Vyukov > > > > Cc: Andrey Ryabinin > > > > Cc: Dmitry Vyukov > > > > Cc: Alexander Potapenko > > > > --- > > > > mm/kasan/common.c | 22 ++ > > > > mm/kasan/generic.c | 18 ++ > > > > mm/kasan/kasan.h | 7 +++ > > > > mm/kasan/report.c | 20 > > > > mm/kasan/tags.c| 37 + > > > > 5 files changed, 64 insertions(+), 40 deletions(-) > > > > > > > > diff --git a/mm/kasan/common.c b/mm/kasan/common.c > > > > index 8bc618289bb1..47b53912f322 100644 > > > > --- a/mm/kasan/common.c > > > > +++ b/mm/kasan/common.c > > > > @@ -51,7 +51,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags) > > > > return stack_depot_save(entries, nr_entries, flags); > > > > } > > > > > > > > -static inline void set_track(struct kasan_track *track, gfp_t flags) > > > > +void kasan_set_track(struct kasan_track *track, gfp_t flags) > > > > { > > > > track->pid = current->pid; > > > > track->stack = kasan_save_stack(flags); > > > > @@ -299,24 +299,6 @@ struct kasan_free_meta *get_free_info(struct > > > > kmem_cache *cache, > > > > return (void *)object + cache->kasan_info.free_meta_offset; > > > > } > > > > > > > > - > > > > -static void kasan_set_free_info(struct kmem_cache *cache, > > > > - void *object, u8 tag) > > > > -{ > > > > - struct kasan_alloc_meta *alloc_meta; > > > > 
- u8 idx = 0; > > > > - > > > > - alloc_meta = get_alloc_info(cache, object); > > > > - > > > > -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY > > > > - idx = alloc_meta->free_track_idx; > > > > - alloc_meta->free_pointer_tag[idx] = tag; > > > > - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; > > > > -#endif > > > > - > > > > - set_track(_meta->free_track[idx], GFP_NOWAIT); > > > > -} > > > > - > > > > void kasan_poison_slab(struct page *page) > > > > { > > > > unsigned long i; > > > > @@ -492,7 +474,7 @@ static void *__kasan_kmalloc(struct kmem_cache > > > > *cache, const void *object, > > > > KASAN_KMALLOC_REDZONE); > > > > > > > > if (cache->flags & SLAB_KASAN) > > > > - set_track(_alloc_info(cache, object)->alloc_track, > > > > flags); > > > > + kasan_set_track(_alloc_info(cache, > > > > object)->alloc_track, flags); > > > > > > > > return set_tag(object, tag); > > > > } > > > > diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c > > > > index 3372bdcaf92a..763d8a13e0ac 100644 > > > > --- a/mm/kasan/generic.c > > > > +++ b/mm/kasan/generic.c > > > > @@ -344,3 +344,21 @@ void kasan_record_aux_stack(void *addr) > > > > alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; > > > > alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); > > > > } > > > > + > > > > +void kasan_set_free_info(struct kmem_cache *cache, > > > > + void *object, u8 tag) > > > > +{ > > > > + struct kasan_free_meta *free_meta; > > > > + > > > > + free_meta = get_free_info(cache, object); > > > > + kasan_set_track(_meta->free_track, GFP_NOWAIT); > > > > +} > > > > + > > > > +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, > > > > + void *object, u8 tag) > > > > +{ > > > > + struct kasan_free_meta *free_meta; > > > > + > > > > + free_meta = get_free_info(cache, object); > > > > + return _meta->free_track; > > > > +} > > > > diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h > > > > index a7391bc83070..ad897ec36545 100644 > > > > --- a/mm/kasan/kasan.h > > > > +++ 
b/mm/kasan/kasan.h > > > > @@ -127,6 +127,9 @@ struct kasan_free_meta { > > > > * Otherwise it might be used for the allocator freelist. > > > > */ > > > > struct qlist_node quarantine_link; > > > > +#ifdef CONFIG_KASAN_GENERIC > > > > + struct kasan_track free_track; > > > > +#endif > > > > }; > > > > > > > > struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, > > > > @@ -168,6 +171,10 @@ void kasan_report_invalid_free(void *object, > > > > unsigned long ip); > > > > struct page *kasan_addr_to_page(const void *addr); > > > > > > > >
Re: [PATCH] arch/{mips,sparc,microblaze,powerpc}: Don't enable pagefault/preempt twice
On Tue, May 19, 2020 at 12:42:15PM -0700, Guenter Roeck wrote: > On Tue, May 19, 2020 at 11:40:32AM -0700, Ira Weiny wrote: > > On Tue, May 19, 2020 at 09:54:22AM -0700, Guenter Roeck wrote: > > > On Mon, May 18, 2020 at 11:48:43AM -0700, ira.we...@intel.com wrote: > > > > From: Ira Weiny > > > > > > > > The kunmap_atomic clean up failed to remove one set of pagefault/preempt > > > > enables when vaddr is not in the fixmap. > > > > > > > > Fixes: bee2128a09e6 ("arch/kunmap_atomic: consolidate duplicate code") > > > > Signed-off-by: Ira Weiny > > > > > > microblazeel works with this patch, > > > > Awesome... Andrew in my rush yesterday I should have put a reported by on > > the > > patch for Guenter as well. > > > > Sorry about that Guenter, > > No worries. > > > Ira > > > > > as do the nosmp sparc32 boot tests, > > > but sparc32 boot tests with SMP enabled still fail with lots of messages > > > such as: > > > > > > BUG: Bad page state in process swapper/0 pfn:006a1 > > > page:f0933420 refcount:0 mapcount:1 mapping:(ptrval) index:0x1 > > > flags: 0x0() > > > raw: 0100 0122 0001 > > > > > > page dumped because: nonzero mapcount > > > Modules linked in: > > > CPU: 0 PID: 1 Comm: swapper/0 Tainted: GB > > > 5.7.0-rc6-next-20200518-2-gb178d2d56f29 #1 > > > [f00e7ab8 : > > > bad_page+0xa8/0x108 ] > > > [f00e8b54 : > > > free_pcppages_bulk+0x154/0x52c ] > > > [f00ea024 : > > > free_unref_page+0x54/0x6c ] > > > [f00ed864 : > > > free_reserved_area+0x58/0xec ] > > > [f0527104 : > > > kernel_init+0x14/0x110 ] > > > [f000b77c : > > > ret_from_kernel_thread+0xc/0x38 ] > > > [ : > > > 0x0 ] > > > > > > Code path leading to that message is different but always the same > > > from free_unref_page(). Actually it occurs to me that the patch consolidating kmap_prot is odd for sparc 32 bit... Its a long shot but could you try reverting this patch? 4ea7d2419e3f kmap: consolidate kmap_prot definitions Alternately I will need to figure out how to run the sparc on qemu here... 
Thanks very much for all the testing though! :-D Ira > > > > > > Still testing ppc images. > > > > > ppc image tests are passing with this patch. > > Guenter
[PATCH] MIPS: SGI-IP27: Remove not used includes and comment in ip27-timer.c
After commit 0ce5ebd24d25 ("mfd: ioc3: Add driver for SGI IOC3 chip"), the related includes and comment about ioc3 are not used any more in ip27-timer.c, remove them. Signed-off-by: Tiezhu Yang --- arch/mips/sgi-ip27/ip27-timer.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index 11ffb3e..115b1d9 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -31,10 +30,6 @@ #define TICK_SIZE (tick_nsec / 1000) -/* Includes for ioc3_init(). */ -#include -#include - static int rt_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned int cpu = smp_processor_id(); -- 2.1.0
Re: [RFC PATCH 0/8] Qualcomm Cloud AI 100 driver
On Tue 19 May 21:59 PDT 2020, Greg Kroah-Hartman wrote: > On Tue, May 19, 2020 at 10:41:15PM +0200, Daniel Vetter wrote: > > > Ok, that's a decision you are going to have to push upward on, as we > > > really can't take this without a working, open, userspace. > > > > Uh wut. > > > > So the merge criteria for drivers/accel (atm still drivers/misc but I > > thought that was interim until more drivers showed up) isn't actually > > "totally-not-a-gpu accel driver without open source userspace". > > > > Instead it's "totally-not-a-gpu accel driver without open source > > userspace" _and_ you have to be best buddies with Greg. Or at least > > not be on the naughty company list. Since for habanalabs all you > > wanted is a few test cases to exercise the ioctls. Not the entire > > userspace. > > Habanalabs now has their full library opensourced that their tools use > directly, so that's not an argument anymore. > > My primary point here is the copyright owner of this code, because of > that, I'm not going to object to allowing this to be merged without open > userspace code. > So because it's copyright Linux Foundation you are going to accept it without user space, after all? Regards, Bjorn
Re: [PATCH 06/12] xen-blkfront: add callbacks for PM suspend and hibernation
Hi Anchal, Thank you for the patch! Yet something to improve: [auto build test ERROR on linus/master] [also build test ERROR on v5.7-rc6] [cannot apply to xen-tip/linux-next tip/irq/core tip/auto-latest next-20200519] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system. BTW, we also suggest to use '--base' option to specify the base tree in git format-patch, please see https://stackoverflow.com/a/37406982] url: https://github.com/0day-ci/linux/commits/Anchal-Agarwal/Fix-PM-hibernation-in-Xen-guests/20200520-073211 base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 03fb3acae4be8a6b680ffedb220a8b6c07260b40 config: x86_64-randconfig-a016-20200519 (attached as .config) compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project e6658079aca6d971b4e9d7137a3a2ecbc9c34aec) reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # install x86_64 cross compiling tool for clang build # apt-get install binutils-x86-64-linux-gnu # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot All error/warnings (new ones prefixed by >>, old ones prefixed by <<): >> drivers/block/xen-blkfront.c:2699:30: warning: missing terminating '"' >> character [-Winvalid-pp-token] xenbus_dev_error(dev, err, "Hibernation Failed. 
^ >> drivers/block/xen-blkfront.c:2699:30: error: expected expression drivers/block/xen-blkfront.c:2700:26: warning: missing terminating '"' character [-Winvalid-pp-token] The ring is still busy"); ^ >> drivers/block/xen-blkfront.c:2726:1: error: function definition is not >> allowed here { ^ >> drivers/block/xen-blkfront.c:2762:10: error: use of undeclared identifier >> 'blkfront_restore' .thaw = blkfront_restore, ^ drivers/block/xen-blkfront.c:2763:13: error: use of undeclared identifier 'blkfront_restore' .restore = blkfront_restore ^ drivers/block/xen-blkfront.c:2767:1: error: function definition is not allowed here { ^ drivers/block/xen-blkfront.c:2800:1: error: function definition is not allowed here { ^ drivers/block/xen-blkfront.c:2822:1: error: function definition is not allowed here { ^ >> drivers/block/xen-blkfront.c:2863:13: error: use of undeclared identifier >> 'xlblk_init' module_init(xlblk_init); ^ drivers/block/xen-blkfront.c:2867:1: error: function definition is not allowed here { ^ >> drivers/block/xen-blkfront.c:2874:13: error: use of undeclared identifier >> 'xlblk_exit' module_exit(xlblk_exit); ^ >> drivers/block/xen-blkfront.c:2880:24: error: expected '}' MODULE_ALIAS("xenblk"); ^ drivers/block/xen-blkfront.c:2674:1: note: to match this '{' { ^ >> drivers/block/xen-blkfront.c:2738:45: warning: ISO C90 forbids mixing >> declarations and code [-Wdeclaration-after-statement] static const struct block_device_operations xlvbd_block_fops = ^ 3 warnings and 11 errors generated. 
vim +2699 drivers/block/xen-blkfront.c 2672 2673 static int blkfront_freeze(struct xenbus_device *dev) 2674 { 2675 unsigned int i; 2676 struct blkfront_info *info = dev_get_drvdata(&dev->dev); 2677 struct blkfront_ring_info *rinfo; 2678 /* This would be reasonable timeout as used in xenbus_dev_shutdown() */ 2679 unsigned int timeout = 5 * HZ; 2680 unsigned long flags; 2681 int err = 0; 2682 2683 info->connected = BLKIF_STATE_FREEZING; 2684 2685 blk_mq_freeze_queue(info->rq); 2686 blk_mq_quiesce_queue(info->rq); 2687 2688 for_each_rinfo(info, rinfo, i) { 2689 /* No more gnttab callback work. */ 2690 gnttab_cancel_free_callback(&rinfo->callback); 2691 /* Flush gnttab callback work. Must be done with no locks held. */ 2692 flush_work(&rinfo->work); 2693 } 2694 2695 for_each_rinfo(info, rinfo, i) { 2696 spin_lock_irqsave(&rinfo->ring_lock, flags); 2697 if (RING_FULL(&rinfo->ring) 2698 || RING_HAS_UNCONSUMED_RESPONSES(&rinfo->ring)) { > 2699 xenbus_dev_error(dev, err, "Hibernation Failed. 2700 The ring is still busy"); 2701 info->connected = BLKIF_STATE_CONNECTED; 2702 spin_unlock_irqrestore(&rinfo->ring_lock, flags); 2703 return -EBUSY; 2704 } 2705 spin_unlock_irqrestore(&rinfo->ring_lock, flags); 2706 } 2707 /* Kick the backend to disconnect */ 2708 xenbus_switch_state(dev, XenbusStat
linux-next boot error: BUG: Invalid wait context ]
Hello, syzbot found the following crash on: HEAD commit:fb57b1fa Add linux-next specific files for 20200519 git tree: linux-next console output: https://syzkaller.appspot.com/x/log.txt?x=17c9196e10 kernel config: https://syzkaller.appspot.com/x/.config?x=2522f758a3588c2d dashboard link: https://syzkaller.appspot.com/bug?extid=08003d278f04ed0944e0 compiler: gcc (GCC) 9.0.0 20181231 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+08003d278f04ed094...@syzkaller.appspotmail.com = [ BUG: Invalid wait context ] 5.7.0-rc6-next-20200519-syzkaller #0 Not tainted - swapper/1/0 is trying to lock: 8880ae737518 (>lock){..-.}-{3:3}, at: spin_lock include/linux/spinlock.h:353 [inline] 8880ae737518 (>lock){..-.}-{3:3}, at: __queue_work+0x2bf/0x1350 kernel/workqueue.c:1448 other info that might help us debug this: context-{2:2} 1 lock held by swapper/1/0: #0: 89bc0040 (rcu_read_lock){}-{1:3}, at: __queue_work+0x175/0x1350 kernel/workqueue.c:1411 stack backtrace: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.7.0-rc6-next-20200519-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_lock_invalid_wait_context kernel/locking/lockdep.c:3988 [inline] check_wait_context kernel/locking/lockdep.c:4049 [inline] __lock_acquire.cold+0x26c/0x458 kernel/locking/lockdep.c:4286 lock_acquire+0x20e/0x960 kernel/locking/lockdep.c:4915 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:353 [inline] __queue_work+0x2bf/0x1350 kernel/workqueue.c:1448 queue_work_on+0x18b/0x200 kernel/workqueue.c:1517 tick_nohz_activate kernel/time/tick-sched.c:1244 [inline] tick_nohz_activate kernel/time/tick-sched.c:1237 [inline] tick_setup_sched_timer+0x20e/0x380 kernel/time/tick-sched.c:1378 
hrtimer_switch_to_hres kernel/time/hrtimer.c:739 [inline] hrtimer_run_queues+0x327/0x3e0 kernel/time/hrtimer.c:1746 run_local_timers+0x49/0x130 kernel/time/timer.c:1798 update_process_times+0x1e/0x60 kernel/time/timer.c:1725 tick_periodic+0x79/0x170 kernel/time/tick-common.c:99 tick_handle_periodic+0x41/0x130 kernel/time/tick-common.c:111 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1080 [inline] smp_apic_timer_interrupt+0x1ad/0x6a0 arch/x86/kernel/apic/apic.c:1105 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:828 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: cc cc cc cc cc cc cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 74 91 59 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 64 91 59 00 fb f4 cc 48 b8 00 00 00 00 00 fc ff df 41 57 41 56 41 55 41 54 55 53 RSP: :c9d3fd50 EFLAGS: 0286 ORIG_RAX: ff13 RAX: 11369a43 RBX: 1920001a7fab RCX: RDX: dc00 RSI: 0006 RDI: 8880a95f0c0c RBP: dc00 R08: 8880a95f0340 R09: R10: R11: R12: ed10152be068 R13: 0001 R14: 8aabeb08 R15: arch_safe_halt arch/x86/include/asm/paravirt.h:150 [inline] default_idle+0x91/0x3d0 arch/x86/kernel/process.c:708 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x393/0x690 kernel/sched/idle.c:269 cpu_startup_entry+0x14/0x20 kernel/sched/idle.c:361 start_secondary+0x2f8/0x410 arch/x86/kernel/smpboot.c:268 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 random: fast init done random: 7 urandom warning(s) missed due to ratelimiting --- This bug is generated by a bot. It may contain errors. See https://goo.gl/tpsmEJ for more information about syzbot. syzbot engineers can be reached at syzkal...@googlegroups.com. syzbot will keep track of this bug report. See: https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
Re: [PATCH] arch/{mips,sparc,microblaze,powerpc}: Don't enable pagefault/preempt twice
On Tue, May 19, 2020 at 12:42:15PM -0700, Guenter Roeck wrote: > On Tue, May 19, 2020 at 11:40:32AM -0700, Ira Weiny wrote: > > On Tue, May 19, 2020 at 09:54:22AM -0700, Guenter Roeck wrote: > > > On Mon, May 18, 2020 at 11:48:43AM -0700, ira.we...@intel.com wrote: > > > > From: Ira Weiny > > > > > > > > The kunmap_atomic clean up failed to remove one set of pagefault/preempt > > > > enables when vaddr is not in the fixmap. > > > > > > > > Fixes: bee2128a09e6 ("arch/kunmap_atomic: consolidate duplicate code") > > > > Signed-off-by: Ira Weiny > > > > > > microblazeel works with this patch, > > > > Awesome... Andrew in my rush yesterday I should have put a reported by on > > the > > patch for Guenter as well. > > > > Sorry about that Guenter, > > No worries. > > > Ira > > > > > as do the nosmp sparc32 boot tests, > > > but sparc32 boot tests with SMP enabled still fail with lots of messages > > > such as: > > > > > > BUG: Bad page state in process swapper/0 pfn:006a1 > > > page:f0933420 refcount:0 mapcount:1 mapping:(ptrval) index:0x1 > > > flags: 0x0() > > > raw: 0100 0122 0001 > > > > > > page dumped because: nonzero mapcount > > > Modules linked in: > > > CPU: 0 PID: 1 Comm: swapper/0 Tainted: GB > > > 5.7.0-rc6-next-20200518-2-gb178d2d56f29 #1 > > > [f00e7ab8 : > > > bad_page+0xa8/0x108 ] > > > [f00e8b54 : > > > free_pcppages_bulk+0x154/0x52c ] > > > [f00ea024 : > > > free_unref_page+0x54/0x6c ] > > > [f00ed864 : > > > free_reserved_area+0x58/0xec ] > > > [f0527104 : > > > kernel_init+0x14/0x110 ] > > > [f000b77c : > > > ret_from_kernel_thread+0xc/0x38 ] > > > [ : > > > 0x0 ] I'm really not seeing how this is related to the kmap clean up. But just to make sure I'm trying to run your environment for sparc and having less luck than with microblaze. Could you give me the command which is failing above? Ira > > > > > > Code path leading to that message is different but always the same > > > from free_unref_page(). > > > > > > Still testing ppc images. 
> > > > > ppc image tests are passing with this patch. > > Guenter
Re: [RFC PATCH v3 2/2] CPPC: add support for SW BOOST
On 19-05-20, 19:41, Xiongfeng Wang wrote: > To add SW BOOST support for CPPC, we need to get the max frequency of > boost mode and non-boost mode. ACPI spec 6.2 section 8.4.7.1 describe > the following two CPC registers. > > "Highest performance is the absolute maximum performance an individual > processor may reach, assuming ideal conditions. This performance level > may not be sustainable for long durations, and may only be achievable if > other platform components are in a specific state; for example, it may > require other processors be in an idle state. > > Nominal Performance is the maximum sustained performance level of the > processor, assuming ideal operating conditions. In absence of an > external constraint (power, thermal, etc.) this is the performance level > the platform is expected to be able to maintain continuously. All > processors are expected to be able to sustain their nominal performance > state simultaneously." > > To add SW BOOST support for CPPC, we can use Highest Performance as the > max performance in boost mode and Nominal Performance as the max > performance in non-boost mode. If the Highest Performance is greater > than the Nominal Performance, we assume SW BOOST is supported. > > The current CPPC driver does not support SW BOOST and use 'Highest > Performance' as the max performance the CPU can achieve. 'Nominal > Performance' is used to convert 'performance' to 'frequency'. That > means, if firmware enable boost and provide a value for Highest > Performance which is greater than Nominal Performance, boost feature is > enabled by default. > > Because SW BOOST is disabled by default, so, after this patch, boost > feature is disabled by default even if boost is enabled by firmware. 
> > Signed-off-by: Xiongfeng Wang > --- > drivers/cpufreq/cppc_cpufreq.c | 39 +-- > 1 file changed, 37 insertions(+), 2 deletions(-) > > diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > index bda0b24..792ed9e 100644 > --- a/drivers/cpufreq/cppc_cpufreq.c > +++ b/drivers/cpufreq/cppc_cpufreq.c > @@ -37,6 +37,7 @@ > * requested etc. > */ > static struct cppc_cpudata **all_cpu_data; > +static bool boost_supported; > > struct cppc_workaround_oem_info { > char oem_id[ACPI_OEM_ID_SIZE + 1]; > @@ -310,7 +311,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy > *policy) >* Section 8.4.7.1.1.5 of ACPI 6.1 spec) >*/ > policy->min = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.lowest_nonlinear_perf); > - policy->max = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.highest_perf); > + policy->max = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.nominal_perf); > > /* >* Set cpuinfo.min_freq to Lowest to make the full range of performance > @@ -318,7 +319,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy > *policy) >* nonlinear perf >*/ > policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.lowest_perf); > - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.highest_perf); > + policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.nominal_perf); > > policy->transition_delay_us = > cppc_cpufreq_get_transition_delay_us(cpu_num); > policy->shared_type = cpu->shared_type; > @@ -343,6 +344,13 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy > *policy) > > cpu->cur_policy = policy; > > + /* > + * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost > + * is supported. > + */ > + if (cpu->perf_caps.highest_perf > cpu->perf_caps.nominal_perf) > + boost_supported = true; > + > /* Set policy->cur to max now. The governors will adjust later. 
*/ > policy->cur = cppc_cpufreq_perf_to_khz(cpu, > cpu->perf_caps.highest_perf); > @@ -410,6 +418,32 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int > cpunum) > return cppc_get_rate_from_fbctrs(cpu, fb_ctrs_t0, fb_ctrs_t1); > } > > +static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) > +{ > + struct cppc_cpudata *cpudata; > + int ret = 0; No need to initialize this. > + > + if (!boost_supported) { > + pr_err("BOOST not supported by CPU or firmware\n"); > + return -EINVAL; > + } > + > + cpudata = all_cpu_data[policy->cpu]; > + if (state) > + policy->max = cppc_cpufreq_perf_to_khz(cpudata, > + cpudata->perf_caps.highest_perf); > + else > + policy->max = cppc_cpufreq_perf_to_khz(cpudata, > + cpudata->perf_caps.nominal_perf); > + policy->cpuinfo.max_freq = policy->max; > + > + ret = freq_qos_update_request(policy->max_freq_req, policy->max); > + if (ret < 0) > + return ret; > + > +
Re: [PATCH 2/2] Add a new sysctl knob: unprivileged_userfaultfd_user_mode_only
Hello everyone, On Fri, May 08, 2020 at 12:54:03PM -0400, Michael S. Tsirkin wrote: > On Fri, May 08, 2020 at 12:52:34PM -0400, Michael S. Tsirkin wrote: > > On Wed, Apr 22, 2020 at 05:26:32PM -0700, Daniel Colascione wrote: > > > This sysctl can be set to either zero or one. When zero (the default) > > > the system lets all users call userfaultfd with or without > > > UFFD_USER_MODE_ONLY, modulo other access controls. When > > > unprivileged_userfaultfd_user_mode_only is set to one, users without > > > CAP_SYS_PTRACE must pass UFFD_USER_MODE_ONLY to userfaultfd or the API > > > will fail with EPERM. This facility allows administrators to reduce > > > the likelihood that an attacker with access to userfaultfd can delay > > > faulting kernel code to widen timing windows for other exploits. > > > > > > Signed-off-by: Daniel Colascione > > > > The approach taken looks like a hard-coded security policy. > > For example, it won't be possible to set the sysctl knob > > in question on any system running kvm. So this is > > no good for any general purpose system. > > > > What's wrong with using a security policy for this instead? > > In fact I see the original thread already mentions selinux, > so it's just a question of making this controllable by > selinux. I agree it'd be preferable if it was not hardcoded, but then this patchset is also much simpler than the previous controlling it through selinux.. I was thinking, an alternative policy that could control it without hard-coding it, is a seccomp-bpf filter, then you can drop 2/2 as well, not just 1/6-4/6. If you keep only 1/2, can't seccomp-bpf enforce userfaultfd to be always called with flags==0x1 without requiring extra modifications in the kernel? Can't you get the feature parity with the CAP_SYS_PTRACE capability too, if you don't wrap those tasks with the ptrace capability under that seccomp filter?
As far as I can tell, it's unprecedented to create a flag for a syscall API, with the only purpose of implementing a seccomp-bpf filter verifying such flag is set, but then if you want to control it with LSM it's even more complex than doing it with seccomp-bpf, and it requires more kernel code too. We could always add 2/2 later, such possibility won't disappear, in fact we could also add 1/6-4/6 later too if that is not enough. If we could begin by merging only 1/2 from this new series and be done with the kernel changes, because we offload the rest of the work to the kernel eBPF JIT, I think it'd be ideal. Thanks, Andrea
Re: [RFC PATCH v3 1/2] cpufreq: change '.set_boost' to act on only one policy
On 19-05-20, 19:41, Xiongfeng Wang wrote: > Macro 'for_each_active_policy()' is defined internally. To avoid some > cpufreq driver needing this macro to iterate over all the policies in > '.set_boost' callback, we redefine '.set_boost' to act on only one > policy and pass the policy as an argument. > 'cpufreq_boost_trigger_state()' iterate over all the policies to set > boost for the system. This is preparation for adding SW BOOST support > for CPPC. > > Signed-off-by: Xiongfeng Wang > --- > drivers/cpufreq/acpi-cpufreq.c | 4 ++-- > drivers/cpufreq/cpufreq.c | 53 > +- > include/linux/cpufreq.h| 2 +- > 3 files changed, 30 insertions(+), 29 deletions(-) > > diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c > index 289e8ce..b0a9eb5 100644 > --- a/drivers/cpufreq/acpi-cpufreq.c > +++ b/drivers/cpufreq/acpi-cpufreq.c > @@ -126,7 +126,7 @@ static void boost_set_msr_each(void *p_en) > boost_set_msr(enable); > } > > -static int set_boost(int val) > +static int set_boost(struct cpufreq_policy *policy, int val) > { > get_online_cpus(); > on_each_cpu(boost_set_msr_each, (void *)(long)val, 1); I think (Rafael can confirm), that you need to update this as well. You don't need to run for each cpu now, but for each CPU in the policy. 
> @@ -162,7 +162,7 @@ static ssize_t store_cpb(struct cpufreq_policy *policy, > const char *buf, > if (ret || val > 1) > return -EINVAL; > > - set_boost(val); > + set_boost(policy, val); > > return count; > } > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > index d03f250..d0d86b1 100644 > --- a/drivers/cpufreq/cpufreq.c > +++ b/drivers/cpufreq/cpufreq.c > @@ -2532,34 +2532,29 @@ void cpufreq_update_limits(unsigned int cpu) > /* > * BOOST* > */ > -static int cpufreq_boost_set_sw(int state) > +static int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) > { > - struct cpufreq_policy *policy; > - > - for_each_active_policy(policy) { > - int ret; > - > - if (!policy->freq_table) > - return -ENXIO; > + int ret; > > - ret = cpufreq_frequency_table_cpuinfo(policy, > - policy->freq_table); > - if (ret) { > - pr_err("%s: Policy frequency update failed\n", > -__func__); > - return ret; > - } > + if (!policy->freq_table) > + return -ENXIO; > > - ret = freq_qos_update_request(policy->max_freq_req, > policy->max); > - if (ret < 0) > - return ret; > + ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); > + if (ret) { > + pr_err("%s: Policy frequency update failed\n", __func__); > + return ret; > } > > + ret = freq_qos_update_request(policy->max_freq_req, policy->max); > + if (ret < 0) > + return ret; > + > return 0; > } > > int cpufreq_boost_trigger_state(int state) > { > + struct cpufreq_policy *policy; > unsigned long flags; > int ret = 0; > > @@ -2570,16 +2565,22 @@ int cpufreq_boost_trigger_state(int state) > cpufreq_driver->boost_enabled = state; > write_unlock_irqrestore(_driver_lock, flags); > > - ret = cpufreq_driver->set_boost(state); > - if (ret) { > - write_lock_irqsave(_driver_lock, flags); > - cpufreq_driver->boost_enabled = !state; > - write_unlock_irqrestore(_driver_lock, flags); > - > - pr_err("%s: Cannot %s BOOST\n", > -__func__, state ? 
"enable" : "disable"); > + for_each_active_policy(policy) { > + ret = cpufreq_driver->set_boost(policy, state); > + if (ret) > + goto err_reset_state; > } > > + return 0; > + > +err_reset_state: > + write_lock_irqsave(&cpufreq_driver_lock, flags); > + cpufreq_driver->boost_enabled = !state; > + write_unlock_irqrestore(&cpufreq_driver_lock, flags); > + > + pr_err("%s: Cannot %s BOOST\n", > +__func__, state ? "enable" : "disable"); > + > return ret; > } > > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h > index 67d5950..3494f67 100644 > --- a/include/linux/cpufreq.h > +++ b/include/linux/cpufreq.h > @@ -367,7 +367,7 @@ struct cpufreq_driver { > > /* platform specific boost support code */ > bool boost_enabled; > - int (*set_boost)(int state); > + int (*set_boost)(struct cpufreq_policy *policy, int state); > }; > > /* flags */ > -- > 1.7.12.4 --
Re: [RFC PATCH 2/2] init: Allow multi-line output of kernel command line
On Wed, 2020-05-20 at 13:41 +0900, Sergey Senozhatsky wrote: > On (20/05/19 12:42), Joe Perches wrote: > > +static void __init print_cmdline(char *line) > > +{ > > +#ifdef CONFIG_PRINTK > > + const char *prefix = "Kernel command line"; > > + size_t len = strlen(line); > > + > > + while (len > PRINTK_LOG_LINE_MAX) { > > + char *pos = line; > > + char *last_pos = pos + PRINTK_LOG_LINE_MAX - 1; > > + char saved_char; > > + /* Find last space char within the maximum line length */ > > + while ((pos = memchr(pos, ' ', len - (pos - line))) && > > + (pos - line) < PRINTK_LOG_LINE_MAX - 1) { > > Don't you need to also count in the 'prefix' length? yup. > > + last_pos = pos; > > + } > > + saved_char = line[last_pos - line]; > > + line[last_pos - line] = 0; > > + pr_notice("%s: %s\n", prefix, line); > > + prefix = "Kernel command line (continued)"; > > + line[last_pos - line] = saved_char; > > + len -= pos - line; > > + line += pos - line; > > + } > > + > > + pr_notice("%s: %s\n", prefix, line); > > +#endif > > +} > > I like this in general. And I agree that we better handle this > externally, on the printk() caller side, so that printk() will > still have sane limits and won't print a 1G string for example. > > I wonder if we need to export PRINTK_LOG_LINE_MAX. I think a #define works well enough.( > Maybe we can > use here something rather random and much shorter instead. E.g. > 256 chars. Hmm. How min(some_max like 132/256, PRINTK_LOG_LINE_MAX) would work. > many crash/monitoring tools can get confused > by multiple "Kernel command line" prefixes? I doubt any as it's an init only function.
Re: [PATCH] MIPS: SGI-IP27: Remove duplicated include in ip27-timer.c
On 05/20/2020 12:03 AM, Thomas Bogendoerfer wrote: On Tue, May 19, 2020 at 08:28:11PM +0800, Tiezhu Yang wrote: After commit 9d0aaf98dc24 ("MIPS: SGI-IP27: Move all shared IP27 declarations to ip27-common.h"), ip27-common.h is included more than once in ip27-timer.c, remove it. Signed-off-by: Tiezhu Yang applied to mips-next. I only removed the second #include. If you want to clean this up further the includes and comment about ioc3_init() could be removed as well. OK, thank you. I will do it later. Thanks, Tiezhu Yang Thomas.
Re: [RFC PATCH 0/8] Qualcomm Cloud AI 100 driver
On Tue, May 19, 2020 at 10:41:15PM +0200, Daniel Vetter wrote: > > Ok, that's a decision you are going to have to push upward on, as we > > really can't take this without a working, open, userspace. > > Uh wut. > > So the merge criteria for drivers/accel (atm still drivers/misc but I > thought that was interim until more drivers showed up) isn't actually > "totally-not-a-gpu accel driver without open source userspace". > > Instead it's "totally-not-a-gpu accel driver without open source > userspace" _and_ you have to be best buddies with Greg. Or at least > not be on the naughty company list. Since for habanalabs all you > wanted is a few test cases to exercise the ioctls. Not the entire > userspace. Habanalabs now has their full library opensourced that their tools use directly, so that's not an argument anymore. My primary point here is the copyright owner of this code, because of that, I'm not going to object to allowing this to be merged without open userspace code. thanks, greg k-h
Re: [PATCH net-next v1 1/2] ethtool: provide UAPI for PHY Signal Quality Index (SQI)
On Tue, May 19, 2020 at 04:03:48PM +0200, Andrew Lunn wrote: > > --- a/net/ethtool/common.c > > +++ b/net/ethtool/common.c > > @@ -310,6 +310,16 @@ int __ethtool_get_link(struct net_device *dev) > > return netif_running(dev) && dev->ethtool_ops->get_link(dev); > > } > > > > +int __ethtool_get_sqi(struct net_device *dev) > > +{ > > + struct phy_device *phydev = dev->phydev; > > + > > + if (!phydev->drv->get_sqi) > > + return -EOPNOTSUPP; > > + > > + return phydev->drv->get_sqi(phydev); > > +} > > + > > You are only providing access via netlink ethtool? There is no ioctl > method to get this. ack > If so, i wonder if common.c is the correct place > for this, or if it should be moved into linkstate.c. You can then drop > the __. ok -- Pengutronix e.K. | | Steuerwalder Str. 21 | http://www.pengutronix.de/ | 31137 Hildesheim, Germany | Phone: +49-5121-206917-0| Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917- | signature.asc Description: PGP signature
Re: [PATCH net-next v1 1/2] ethtool: provide UAPI for PHY Signal Quality Index (SQI)
On Tue, May 19, 2020 at 03:26:30PM +0200, Andrew Lunn wrote: > On Tue, May 19, 2020 at 09:51:59AM +0200, Oleksij Rempel wrote: > > Signal Quality Index is a mandatory value required by "OPEN Alliance > > SIG" for the 100Base-T1 PHYs [1]. This indicator can be used for cable > > integrity diagnostic and investigating other noise sources and > > implement by at least two vendors: NXP[2] and TI[3]. > > Hi Oleksij > > With a multi part patch set, please always include a cover note, > describing what the patchset as a whole does. ok > > +int __ethtool_get_sqi(struct net_device *dev) > > +{ > > + struct phy_device *phydev = dev->phydev; > > + > > + if (!phydev->drv->get_sqi) > > + return -EOPNOTSUPP; > > + > > + return phydev->drv->get_sqi(phydev); > > +} > > You are not doing any locking here, which you should. Due to modules > vs built in, it can be a bit tricky getting this right. Take a look at > how ethtool ioctl.c uses phy_ethtool_get_stats() and that inline > function itself. ok. -- Pengutronix e.K. | | Steuerwalder Str. 21 | http://www.pengutronix.de/ | 31137 Hildesheim, Germany | Phone: +49-5121-206917-0| Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917- | signature.asc Description: PGP signature
general protection fault in kobject_get (2)
Hello, syzbot found the following crash on: HEAD commit:d00f26b6 Merge git://git.kernel.org/pub/scm/linux/kernel/g.. git tree: net-next console output: https://syzkaller.appspot.com/x/log.txt?x=1316343c10 kernel config: https://syzkaller.appspot.com/x/.config?x=26d0bd769afe1a2c dashboard link: https://syzkaller.appspot.com/bug?extid=407fd358a932bbf639c6 compiler: gcc (GCC) 9.0.0 20181231 (experimental) Unfortunately, I don't have any reproducer for this crash yet. IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+407fd358a932bbf63...@syzkaller.appspotmail.com general protection fault, probably for non-canonical address 0xdc13: [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0098-0x009f] CPU: 1 PID: 16682 Comm: syz-executor.3 Not tainted 5.7.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kobject_get+0x30/0x150 lib/kobject.c:640 Code: 53 e8 d4 7e c6 fd 4d 85 e4 0f 84 a2 00 00 00 e8 c6 7e c6 fd 49 8d 7c 24 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 e7 00 00 00 RSP: 0018:c9000772f240 EFLAGS: 00010203 RAX: dc00 RBX: 85acfca0 RCX: c9000fc67000 RDX: 0013 RSI: 83acadfa RDI: 009c RBP: 0060 R08: 8880a8dfa4c0 R09: ed100a03f403 R10: 8880501fa017 R11: ed100a03f402 R12: 0060 R13: c9000772f3c0 R14: 88805d1ec4e8 R15: 88805d1ec580 FS: 7f1ebed26700() GS:8880ae70() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 004d88f0 CR3: a86c4000 CR4: 001406e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: get_device+0x20/0x30 drivers/base/core.c:2620 __ib_get_client_nl_info+0x1d4/0x2a0 drivers/infiniband/core/device.c:1863 ib_get_client_nl_info+0x30/0x180 drivers/infiniband/core/device.c:1883 nldev_get_chardev+0x52b/0xa40 drivers/infiniband/core/nldev.c:1625 rdma_nl_rcv_msg drivers/infiniband/core/netlink.c:195 [inline] rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] 
rdma_nl_rcv+0x586/0x900 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x537/0x740 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x882/0xe10 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 sys_sendmsg+0x6e6/0x810 net/socket.c:2352 ___sys_sendmsg+0x100/0x170 net/socket.c:2406 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2439 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 RIP: 0033:0x45c829 Code: 0d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:7f1ebed25c78 EFLAGS: 0246 ORIG_RAX: 002e RAX: ffda RBX: 004ff720 RCX: 0045c829 RDX: RSI: 2200 RDI: 0003 RBP: 0078bf00 R08: R09: R10: R11: 0246 R12: R13: 09ad R14: 004d5f10 R15: 7f1ebed266d4 Modules linked in: ---[ end trace 239938a6c4c3c99f ]--- RIP: 0010:kobject_get+0x30/0x150 lib/kobject.c:640 Code: 53 e8 d4 7e c6 fd 4d 85 e4 0f 84 a2 00 00 00 e8 c6 7e c6 fd 49 8d 7c 24 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 e7 00 00 00 RSP: 0018:c9000772f240 EFLAGS: 00010203 RAX: dc00 RBX: 85acfca0 RCX: c9000fc67000 RDX: 0013 RSI: 83acadfa RDI: 009c RBP: 0060 R08: 8880a8dfa4c0 R09: ed100a03f403 R10: 8880501fa017 R11: ed100a03f402 R12: 0060 R13: c9000772f3c0 R14: 88805d1ec4e8 R15: 88805d1ec580 FS: 7f1ebed26700() GS:8880ae70() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 0073fad4 CR3: a86c4000 CR4: 001406e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 --- This bug is generated by a bot. It may contain errors. See https://goo.gl/tpsmEJ for more information about syzbot. syzbot engineers can be reached at syzkal...@googlegroups.com. syzbot will keep track of this bug report. See: https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
Re: [PATCH] mailbox: imx: Disable the clock on devm_mbox_controller_register() failure
On Wed, May 20, 2020 at 12:22:46AM -0300, Fabio Estevam wrote: > devm_mbox_controller_register() may fail, and in the case of failure the > priv->clk clock that was previously enabled, should be disabled. > > Fixes: 2bb7005696e2 ("mailbox: Add support for i.MX messaging unit") > Signed-off-by: Fabio Estevam Acked-by: Oleksij Rempel > --- > drivers/mailbox/imx-mailbox.c | 8 +++- > 1 file changed, 7 insertions(+), 1 deletion(-) > > diff --git a/drivers/mailbox/imx-mailbox.c b/drivers/mailbox/imx-mailbox.c > index 7906624a731c..3f7c4548c18f 100644 > --- a/drivers/mailbox/imx-mailbox.c > +++ b/drivers/mailbox/imx-mailbox.c > @@ -508,7 +508,13 @@ static int imx_mu_probe(struct platform_device *pdev) > > platform_set_drvdata(pdev, priv); > > - return devm_mbox_controller_register(dev, &priv->mbox); > + ret = devm_mbox_controller_register(dev, &priv->mbox); > + if (ret) { > + clk_disable_unprepare(priv->clk); > + return ret; > + } > + > + return 0; > } > > static int imx_mu_remove(struct platform_device *pdev) > -- > 2.17.1 > > -- Pengutronix e.K. | | Steuerwalder Str. 21 | http://www.pengutronix.de/ | 31137 Hildesheim, Germany | Phone: +49-5121-206917-0| Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917- | signature.asc Description: PGP signature
Re: [RFC][PATCH 5/5] thermal: int340x: Use new device interface
On Mon, May 4, 2020 at 11:47 PM Srinivas Pandruvada wrote: > > Use the new framework to send notifications for: > - Setting temperature threshold for notification to avoid polling > - Send THERMAL_TRIP_REACHED event on reaching threshold > - Send THERMAL_TRIP_UPDATE when firmware changes the existing trip > temperature I am a little confused here. I would've expected the thermal core to send the THERMAL_TRIP_* notifications, not platform drivers. Why shouldn't this be done in thermal core? > > Signed-off-by: Srinivas Pandruvada > --- > .../intel/int340x_thermal/int3403_thermal.c | 3 ++ > .../int340x_thermal/int340x_thermal_zone.c| 29 +++ > .../int340x_thermal/int340x_thermal_zone.h| 7 + > .../processor_thermal_device.c| 1 + > 4 files changed, 40 insertions(+) > > diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c > b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c > index f86cbb125e2f..77c014a113a4 100644 > --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c > +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c > @@ -63,15 +63,18 @@ static void int3403_notify(acpi_handle handle, > > switch (event) { > case INT3403_PERF_CHANGED_EVENT: > + int340x_thermal_send_user_event(obj->int340x_zone, > THERMAL_PERF_CHANGED, 0); > break; > case INT3403_THERMAL_EVENT: > int340x_thermal_zone_device_update(obj->int340x_zone, >THERMAL_TRIP_VIOLATED); > + int340x_thermal_send_user_event(obj->int340x_zone, > THERMAL_TRIP_REACHED, 0); > break; > case INT3403_PERF_TRIP_POINT_CHANGED: > int340x_thermal_read_trips(obj->int340x_zone); > int340x_thermal_zone_device_update(obj->int340x_zone, >THERMAL_TRIP_CHANGED); > + int340x_thermal_send_user_event(obj->int340x_zone, > THERMAL_TRIP_UPDATE, 0); > break; > default: > dev_err(&priv->pdev->dev, "Unsupported event [0x%x]\n", > event); > diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c > b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c > index 
432213272f1e..9568a2db7afd 100644 > --- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c > +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c > @@ -146,12 +146,41 @@ static int int340x_thermal_get_trip_hyst(struct > thermal_zone_device *zone, > return 0; > } > > +static int int340x_thermal_get_thres_low(struct thermal_zone_device *zone, > int *temp) > +{ > + struct int34x_thermal_zone *d = zone->devdata; > + > + *temp = d->aux_trips[0]; > + > + return 0; > +} > + > +static int int340x_thermal_set_thres_low(struct thermal_zone_device *zone, > int temp) > +{ > + struct int34x_thermal_zone *d = zone->devdata; > + acpi_status status; > + > + if (d->override_ops && d->override_ops->set_trip_temp) > + return d->override_ops->set_trip_temp(zone, 0, temp); > + > + status = acpi_execute_simple_method(d->adev->handle, "PAT0", > + millicelsius_to_deci_kelvin(temp)); > + if (ACPI_FAILURE(status)) > + return -EIO; > + > + d->aux_trips[0] = temp; > + > + return 0; > +} > + > static struct thermal_zone_device_ops int340x_thermal_zone_ops = { > .get_temp = int340x_thermal_get_zone_temp, > .get_trip_temp = int340x_thermal_get_trip_temp, > .get_trip_type = int340x_thermal_get_trip_type, > .set_trip_temp = int340x_thermal_set_trip_temp, > .get_trip_hyst = int340x_thermal_get_trip_hyst, > + .set_temp_thres_low = int340x_thermal_set_thres_low, > + .get_temp_thres_low = int340x_thermal_get_thres_low, > }; > > static int int340x_thermal_get_trip_config(acpi_handle handle, char *name, > diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h > b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h > index 3b4971df1b33..142027e4955f 100644 > --- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h > +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.h > @@ -58,4 +58,11 @@ static inline void int340x_thermal_zone_device_update( > thermal_zone_device_update(tzone->zone, event); > } > > +static inline void 
int340x_thermal_send_user_event( > + struct int34x_thermal_zone *tzone, > + enum thermal_device_events event, > + u64 data) > +{ > + thermal_dev_send_event(tzone->zone->id, event, data); > +} > #endif > diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c >
Re: [PATCH v1] usb: musb: dsps: set MUSB_DA8XX quirk for AM335x
On Tue, May 19, 2020 at 05:18:51PM -0500, Bin Liu wrote: > Hi, > > On Fri, Mar 27, 2020 at 06:38:49AM +0100, Oleksij Rempel wrote: > > Beagle Bone Black has different memory corruptions if kernel is > > configured with USB_TI_CPPI41_DMA=y. This issue is reproducible with > > ath9k-htc driver (ar9271 based wifi usb controller): > > > > root@AccessBox:~ iw dev wlan0 set monitor fcsfail otherbss > > root@AccessBox:~ ip l s dev wlan0 up > > kmemleak: Cannot insert 0xda577e40 into the object search tree (overlaps > > existing) > > CPU: 0 PID: 176 Comm: ip Not tainted 5.5.0 #7 > > Hardware name: Generic AM33XX (Flattened Device Tree) > > [] (unwind_backtrace) from [] (show_stack+0x18/0x1c) > > [] (show_stack) from [] (dump_stack+0x84/0x98) > > [] (dump_stack) from [] (create_object+0x2f8/0x324) > > [] (create_object) from [] > > (kmem_cache_alloc+0x1a8/0x39c) > > [] (kmem_cache_alloc) from [] (__alloc_skb+0x60/0x174) > > [] (__alloc_skb) from [] (ath9k_wmi_cmd+0x50/0x184 > > [ath9k_htc]) > > [] (ath9k_wmi_cmd [ath9k_htc]) from [] > > (ath9k_regwrite_multi+0x54/0x84 [ath9k_htc]) > > [] (ath9k_regwrite_multi [ath9k_htc]) from [] > > (ath9k_regwrite+0xf0/0xfc [ath9k_htc]) > > [] (ath9k_regwrite [ath9k_htc]) from [] > > (ar5008_hw_process_ini+0x280/0x6c0 [ath9k_hw]) > > [] (ar5008_hw_process_ini [ath9k_hw]) from [] > > (ath9k_hw_reset+0x270/0x1458 [ath9k_hw]) > > [] (ath9k_hw_reset [ath9k_hw]) from [] > > (ath9k_htc_start+0xb0/0x22c [ath9k_htc]) > > [] (ath9k_htc_start [ath9k_htc]) from [] > > (drv_start+0x4c/0x1e8 [mac80211]) > > [] (drv_start [mac80211]) from [] > > (ieee80211_do_open+0x480/0x954 [mac80211]) > > [] (ieee80211_do_open [mac80211]) from [] > > (__dev_open+0xdc/0x160) > > [] (__dev_open) from [] (__dev_change_flags+0x1a4/0x204) > > [] (__dev_change_flags) from [] > > (dev_change_flags+0x20/0x50) > > [] (dev_change_flags) from [] (do_setlink+0x2ac/0x978) > > > > After applying this patch, the system is running in monitor mode without > > noticeable issues. 
> > > > Suggested-by: Michael Grzeschik > > Signed-off-by: Oleksij Rempel > > --- > > drivers/usb/musb/musb_dsps.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c > > index 88923175f71e..c01f9e9e69f5 100644 > > --- a/drivers/usb/musb/musb_dsps.c > > +++ b/drivers/usb/musb/musb_dsps.c > > @@ -690,7 +690,7 @@ static void dsps_dma_controller_resume(struct dsps_glue > > *glue) {} > > #endif /* CONFIG_USB_TI_CPPI41_DMA */ > > > > static struct musb_platform_ops dsps_ops = { > > - .quirks = MUSB_DMA_CPPI41 | MUSB_INDEXED_EP, > > + .quirks = MUSB_DMA_CPPI41 | MUSB_INDEXED_EP | MUSB_DA8XX, > > The MUSB_DA8XX flag cannot be simply applied to MUSB_DSPS, at least the > teardown and autoreq register offsets are different as shown in > cppi41_dma_controller_create(). ok > Do you understand what exactly caused the issue? No. Disabling DMA support "solves" this issue as well. Besides, with DMA support, there remains one more crash with different symptoms. I can work around it by disabling CPU Freq governor, or setting it to performance. > The kernel trace above doesn't provide enough information. Do you have any suggestions on how to instrument the kernel to get the needed information? Or, should I try to capture USB traffic before the crash? If it helps, ath9k_htc is a usb wifi adapter. It generates a lot of USB traffic on multiple endpoints. Bulk with data packets and Interrupt with register accesses, LED blinking... etc. Regards, Oleksij -- Pengutronix e.K. | | Steuerwalder Str. 21 | http://www.pengutronix.de/ | 31137 Hildesheim, Germany | Phone: +49-5121-206917-0| Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917- | signature.asc Description: PGP signature
mmotm 2020-05-19-21-47 uploaded
The mm-of-the-moment snapshot 2020-05-19-21-47 has been uploaded to http://www.ozlabs.org/~akpm/mmotm/ mmotm-readme.txt says README for mm-of-the-moment: http://www.ozlabs.org/~akpm/mmotm/ This is a snapshot of my -mm patch queue. Uploaded at random hopefully more than once a week. You will need quilt to apply these patches to the latest Linus release (5.x or 5.x-rcY). The series file is in broken-out.tar.gz and is duplicated in http://ozlabs.org/~akpm/mmotm/series The file broken-out.tar.gz contains two datestamp files: .DATE and .DATE-yyyy-mm-dd-hh-mm-ss. Both contain the string yyyy-mm-dd-hh-mm-ss, followed by the base kernel version against which this patch series is to be applied. This tree is partially included in linux-next. To see which patches are included in linux-next, consult the `series' file. Only the patches within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in linux-next. A full copy of the full kernel tree with the linux-next and mmotm patches already applied is available through git within an hour of the mmotm release. Individual mmotm releases are tagged. The master branch always points to the latest release, so it's constantly rebasing. https://github.com/hnaz/linux-mm The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second) contains daily snapshots of the -mm tree. It is updated more frequently than mmotm, and is untested. 
A git copy of this tree is also available at https://github.com/hnaz/linux-mm This mmotm tree contains the following patches against 5.7-rc6: (patches marked "*" will be included in linux-next) origin.patch * checkpatch-test-git_dir-changes.patch * proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch * proc-kpageflags-do-not-use-uninitialized-struct-pages.patch * kcov-cleanup-debug-messages.patch * kcov-fix-potential-use-after-free-in-kcov_remote_start.patch * kcov-move-t-kcov-assignments-into-kcov_start-stop.patch * kcov-move-t-kcov_sequence-assignment.patch * kcov-use-t-kcov_mode-as-enabled-indicator.patch * kcov-collect-coverage-from-interrupts.patch * usb-core-kcov-collect-coverage-from-usb-complete-callback.patch * memcg-optimize-memorynuma_stat-like-memorystat.patch * lib-lzo-fix-ambiguous-encoding-bug-in-lzo-rle.patch * device-dax-dont-leak-kernel-memory-to-user-space-after-unloading-kmem.patch * x86-bitops-fix-build-regression.patch * mm-compaction-avoid-vm_bug_onpageslab-in-page_mapcount.patch * rapidio-fix-an-error-in-get_user_pages_fast-error-handling.patch * selftests-vm-gitignore-add-mremap_dontunmap.patch * selftests-vm-write_to_hugetlbfsc-fix-unused-variable-warning.patch * kasan-disable-branch-tracing-for-core-runtime.patch * sh-include-linux-time_typesh-for-sockios.patch * maintainers-update-email-address-for-naoya-horiguchi.patch * scripts-support-compiled-source-improved-precise.patch * scripts-add-a-intermediate-file-for-make-gtags.patch * squashfs-migrate-from-ll_rw_block-usage-to-bio.patch * squashfs-migrate-from-ll_rw_block-usage-to-bio-fix.patch * ocfs2-add-missing-annotation-for-dlm_empty_lockres.patch * ocfs2-mount-shared-volume-without-ha-stack.patch * drivers-tty-serial-sh-scic-suppress-uninitialized-var-warning.patch * ramfs-support-o_tmpfile.patch * vfs-track-per-sb-writeback-errors-and-report-them-to-syncfs.patch * buffer-record-blockdev-write-errors-in-super_block-that-it-backs.patch * 
kernel-watchdog-flush-all-printk-nmi-buffers-when-hardlockup-detected.patch mm.patch * usercopy-mark-dma-kmalloc-caches-as-usercopy-caches.patch * mm-slub-fix-corrupted-freechain-in-deactivate_slab.patch * mm-slub-fix-corrupted-freechain-in-deactivate_slab-fix.patch * slub-remove-userspace-notifier-for-cache-add-remove.patch * slub-remove-kmalloc-under-list_lock-from-list_slab_objects.patch * mm-slub-fix-stack-overruns-with-slub_stats.patch * mm-slub-add-panic_on_error-to-the-debug-facilities-fix.patch * mm-dump_page-do-not-crash-with-invalid-mapping-pointer.patch * mm-move-readahead-prototypes-from-mmh.patch * mm-return-void-from-various-readahead-functions.patch * mm-ignore-return-value-of-readpages.patch * mm-move-readahead-nr_pages-check-into-read_pages.patch * mm-add-new-readahead_control-api.patch * mm-use-readahead_control-to-pass-arguments.patch * mm-rename-various-offset-parameters-to-index.patch * mm-rename-readahead-loop-variable-to-i.patch * mm-remove-page_offset-from-readahead-loop.patch * mm-put-readahead-pages-in-cache-earlier.patch * mm-add-readahead-address-space-operation.patch * mm-move-end_index-check-out-of-readahead-loop.patch * mm-add-page_cache_readahead_unbounded.patch * mm-document-why-we-dont-set-pagereadahead.patch * mm-use-memalloc_nofs_save-in-readahead-path.patch * fs-convert-mpage_readpages-to-mpage_readahead.patch * btrfs-convert-from-readpages-to-readahead.patch * erofs-convert-uncompressed-files-from-readpages-to-readahead.patch * erofs-convert-compressed-files-from-readpages-to-readahead.patch *
Re: [PATCH V2] powerpc/perf: Add support for outputting extended regs in perf intr_regs
On 5/19/20 11:45 AM, Athira Rajeev wrote: From: Anju T Sudhakar Add support for perf extended register capability in powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the PMU which support extended registers. The generic code define the mask of extended registers as 0 for non supported architectures. Patch adds extended regs support for power9 platform by exposing MMCR0, MMCR1 and MMCR2 registers. REG_RESERVED mask needs update to include extended regs. `PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers, is defined at runtime in the kernel based on platform since the supported registers may differ from one processor version to another and hence the MASK value. Perf tools side uses extended mask to display the platform supported register names (with -I? option) to the user and also send this mask to the kernel to capture the extended registers in each sample. Hence decide the mask value based on the processor version. with patch -- available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2 PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0 ... intr regs: mask 0x ABI 64-bit r00xc012b77c r10xc03fe5e03930 r20xc1b0e000 r30xc03fdcddf800 r40xc03fc788 r50x9c422724be r60xc03fe5e03908 r70xff63bddc8706 r80x9e4 r90x0 r10 0x1 r11 0x0 r12 0xc01299c0 r13 0xc03c4800 r14 0x0 r15 0x7fffdd8b8b00 r16 0x0 r17 0x7fffdd8be6b8 r18 0x7e7076607730 r19 0x2f r20 0xc0001fc26c68 r21 0xc0002041e4227e00 r22 0xc0002018fb60 r23 0x1 r24 0xc03ffec4d900 r25 0x8000 r26 0x0 r27 0x1 r28 0x1 r29 0xc1be1260 r30 0x6008010 r31 0xc03ffebb7218 nip 0xc012b910 msr 0x90009033 orig_r3 0xc012b86c ctr 0xc01299c0 link 0xc012b77c xer 0x0 ccr 0x2800 softe 0x1 trap 0xf00 dar 0x0 dsisr 0x800 sier 0x0 mmcra 0x800 mmcr0 0x82008090 mmcr1 0x1e00 mmcr2 0x0 ... 
thread: perf:4784 Signed-off-by: Anju T Sudhakar [Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ] Signed-off-by: Athira Rajeev --- Changes from v1 -> v2 - PERF_REG_EXTENDED_MASK` is defined at runtime in the kernel based on platform. This will give flexibility in using extended regs for all processor versions where the supported registers may differ. - removed PERF_REG_EXTENDED_MASK from the perf tools side. Based on the processor version(from PVR value), tool side will return the appropriate extended mask - Since tool changes can handle without a "PERF_REG_EXTENDED_MASK" macro, dropped patch to set NO_AUXTRACE. - Addressed review comments from Ravi Bangoria for V1 --- arch/powerpc/include/asm/perf_event_server.h| 8 arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++- arch/powerpc/perf/core-book3s.c | 1 + arch/powerpc/perf/perf_regs.c | 34 ++-- arch/powerpc/perf/power9-pmu.c | 6 +++ tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++- tools/perf/arch/powerpc/include/perf_regs.h | 5 ++- tools/perf/arch/powerpc/util/perf_regs.c| 54 + 8 files changed, 130 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 3e9703f..1458e1a 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -15,6 +15,9 @@ #define MAX_EVENT_ALTERNATIVES8 #define MAX_LIMITED_HWCOUNTERS2 +extern u64 mask_var; +#define PERF_REG_EXTENDED_MASK mask_var + struct perf_event; /* @@ -55,6 +58,11 @@ struct power_pmu { int *blacklist_ev; /* BHRB entries in the PMU */ int bhrb_nr; + /* +* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if +* the pmu supports extended perf regs capability +*/ + int capabilities; }; /* diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h index f599064..485b1d5 100644 --- a/arch/powerpc/include/uapi/asm/perf_regs.h +++ 
b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -48,6 +48,18 @@ enum perf_event_powerpc_regs { PERF_REG_POWERPC_DSISR, PERF_REG_POWERPC_SIER, PERF_REG_POWERPC_MMCRA, - PERF_REG_POWERPC_MAX, + /* Extended registers */ +
Re: [PATCH v4 2/4] kasan: record and print the free track
On Wed, May 20, 2020 at 6:03 AM Walter Wu wrote: > > > On Tue, May 19, 2020 at 4:25 AM Walter Wu wrote: > > > > > > Move free track from slub alloc meta-data to slub free meta-data in > > > order to make struct kasan_free_meta size is 16 bytes. It is a good > > > size because it is the minimal redzone size and a good number of > > > alignment. > > > > > > For free track in generic KASAN, we do the modification in struct > > > kasan_alloc_meta and kasan_free_meta: > > > - remove free track from kasan_alloc_meta. > > > - add free track into kasan_free_meta. > > > > > > [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 > > > > > > Signed-off-by: Walter Wu > > > Suggested-by: Dmitry Vyukov > > > Cc: Andrey Ryabinin > > > Cc: Dmitry Vyukov > > > Cc: Alexander Potapenko > > > --- > > > mm/kasan/common.c | 22 ++ > > > mm/kasan/generic.c | 18 ++ > > > mm/kasan/kasan.h | 7 +++ > > > mm/kasan/report.c | 20 > > > mm/kasan/tags.c| 37 + > > > 5 files changed, 64 insertions(+), 40 deletions(-) > > > > > > diff --git a/mm/kasan/common.c b/mm/kasan/common.c > > > index 8bc618289bb1..47b53912f322 100644 > > > --- a/mm/kasan/common.c > > > +++ b/mm/kasan/common.c > > > @@ -51,7 +51,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags) > > > return stack_depot_save(entries, nr_entries, flags); > > > } > > > > > > -static inline void set_track(struct kasan_track *track, gfp_t flags) > > > +void kasan_set_track(struct kasan_track *track, gfp_t flags) > > > { > > > track->pid = current->pid; > > > track->stack = kasan_save_stack(flags); > > > @@ -299,24 +299,6 @@ struct kasan_free_meta *get_free_info(struct > > > kmem_cache *cache, > > > return (void *)object + cache->kasan_info.free_meta_offset; > > > } > > > > > > - > > > -static void kasan_set_free_info(struct kmem_cache *cache, > > > - void *object, u8 tag) > > > -{ > > > - struct kasan_alloc_meta *alloc_meta; > > > - u8 idx = 0; > > > - > > > - alloc_meta = get_alloc_info(cache, object); > > > - > > > -#ifdef 
CONFIG_KASAN_SW_TAGS_IDENTIFY > > > - idx = alloc_meta->free_track_idx; > > > - alloc_meta->free_pointer_tag[idx] = tag; > > > - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; > > > -#endif > > > - > > > - set_track(_meta->free_track[idx], GFP_NOWAIT); > > > -} > > > - > > > void kasan_poison_slab(struct page *page) > > > { > > > unsigned long i; > > > @@ -492,7 +474,7 @@ static void *__kasan_kmalloc(struct kmem_cache > > > *cache, const void *object, > > > KASAN_KMALLOC_REDZONE); > > > > > > if (cache->flags & SLAB_KASAN) > > > - set_track(_alloc_info(cache, object)->alloc_track, > > > flags); > > > + kasan_set_track(_alloc_info(cache, > > > object)->alloc_track, flags); > > > > > > return set_tag(object, tag); > > > } > > > diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c > > > index 3372bdcaf92a..763d8a13e0ac 100644 > > > --- a/mm/kasan/generic.c > > > +++ b/mm/kasan/generic.c > > > @@ -344,3 +344,21 @@ void kasan_record_aux_stack(void *addr) > > > alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; > > > alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); > > > } > > > + > > > +void kasan_set_free_info(struct kmem_cache *cache, > > > + void *object, u8 tag) > > > +{ > > > + struct kasan_free_meta *free_meta; > > > + > > > + free_meta = get_free_info(cache, object); > > > + kasan_set_track(_meta->free_track, GFP_NOWAIT); > > > +} > > > + > > > +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, > > > + void *object, u8 tag) > > > +{ > > > + struct kasan_free_meta *free_meta; > > > + > > > + free_meta = get_free_info(cache, object); > > > + return _meta->free_track; > > > +} > > > diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h > > > index a7391bc83070..ad897ec36545 100644 > > > --- a/mm/kasan/kasan.h > > > +++ b/mm/kasan/kasan.h > > > @@ -127,6 +127,9 @@ struct kasan_free_meta { > > > * Otherwise it might be used for the allocator freelist. 
> > > */ > > > struct qlist_node quarantine_link; > > > +#ifdef CONFIG_KASAN_GENERIC > > > + struct kasan_track free_track; > > > +#endif > > > }; > > > > > > struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, > > > @@ -168,6 +171,10 @@ void kasan_report_invalid_free(void *object, > > > unsigned long ip); > > > struct page *kasan_addr_to_page(const void *addr); > > > > > > depot_stack_handle_t kasan_save_stack(gfp_t flags); > > > +void kasan_set_track(struct kasan_track *track, gfp_t flags); > > > +void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); > > > +struct kasan_track *kasan_get_free_track(struct kmem_cache
Re: [RFC][PATCH 1/5] thermal: Add support for /dev/thermal_notify
On Mon, May 4, 2020 at 11:47 PM Srinivas Pandruvada wrote: > > This change adds an optional feature to add a new device entry > /dev/thermal_notify. > > When config CONFIG_THERMAL_USER_EVENT_INTERFACE is selected, this new > device entry is created. > > Thermal core or any thermal driver can use thermal_dev_send_event() interface Do you have any particular use case in mind where a platform driver will use this interface to send platform-specific events? IMO, we should probably try to keep this restricted to messages from thermal core if we are to have any hope of having a standard library in userspace capable of parsing these thermal events. > to send events. Each user events follows a standard format: > - zone_id > - event_id > - event_data > - reserved for future, currently 0s > > User space can basically: > fd = open ("/dev/thermal_notify") > In a loop > read (fd) > read and process event > > or > fd = open ("/dev/thermal_notify") > Set the fs as non blocking > In a loop > Use poll() and wait > read and process event > > There are predefined events added to thermal.h. Based on need they can > be extended. > > Signed-off-by: Srinivas Pandruvada > --- > drivers/thermal/Kconfig | 9 ++ > drivers/thermal/Makefile | 3 + > drivers/thermal/thermal_dev_if.c | 195 +++ > include/linux/thermal.h | 24 > 4 files changed, 231 insertions(+) > create mode 100644 drivers/thermal/thermal_dev_if.c > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > index 91af271e9bb0..27d05d62458e 100644 > --- a/drivers/thermal/Kconfig > +++ b/drivers/thermal/Kconfig > @@ -78,6 +78,15 @@ config THERMAL_WRITABLE_TRIPS > Say 'Y' here if you would like to allow userspace tools to > change trip temperatures. > > +config THERMAL_USER_EVENT_INTERFACE > + bool "Allow user space to read thermal events from a dev file" > + help > + This option allows a user space program to read thermal events > + via /dev/thermal_notify file. 
> + > + Say 'Y' here if you would like to allow userspace programs to > + read thermal events. > + > choice > prompt "Default Thermal governor" > default THERMAL_DEFAULT_GOV_STEP_WISE > diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile > index 8c8ed7b79915..8f65832d755a 100644 > --- a/drivers/thermal/Makefile > +++ b/drivers/thermal/Makefile > @@ -11,6 +11,9 @@ thermal_sys-y += thermal_core.o > thermal_sysfs.o \ > thermal_sys-$(CONFIG_THERMAL_HWMON)+= thermal_hwmon.o > thermal_sys-$(CONFIG_THERMAL_OF) += of-thermal.o > > +# Thermal user space events > +obj-$(CONFIG_THERMAL_USER_EVENT_INTERFACE) += thermal_dev_if.o > + > # governors > thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE) += fair_share.o > thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG)+= gov_bang_bang.o > diff --git a/drivers/thermal/thermal_dev_if.c > b/drivers/thermal/thermal_dev_if.c > new file mode 100644 > index ..763bfe9eef9d > --- /dev/null > +++ b/drivers/thermal/thermal_dev_if.c > @@ -0,0 +1,195 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Thermal device file interface > + * Copyright (c) 2020, Intel Corporation. > + * All rights reserved. 
> + * > + * Author: Srinivas Pandruvada > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#define THERMAL_DEV_FIFO_SIZE 1024 > + > +struct thermal_chdev_sample { > + int zone_id; > + int event; > + u64 event_data; > + u64 reserved; > +}; > + > +struct thermal_chdev { > + struct miscdevice therm_dev; > + struct kfifo data_fifo; > + unsigned long misc_opened; > + wait_queue_head_t wait; > +}; > + > +static DEFINE_MUTEX(thermal_chdev_mutex); > +static struct thermal_chdev *thermal_chdev; > + > +static int thermal_chdev_open(struct inode *inode, struct file *file) > +{ > + struct thermal_chdev *chdev; > + > + chdev = container_of(file->private_data, struct thermal_chdev, > therm_dev); > + > + /* We essentially have single reader and writer */ > + if (test_and_set_bit(0, >misc_opened)) > + return -EBUSY; > + > + return stream_open(inode, file); > +} > + > +static int thermal_chdev_release(struct inode *inode, struct file *file) > +{ > + struct thermal_chdev *chdev; > + > + chdev = container_of(file->private_data, struct thermal_chdev, > therm_dev); > + > + clear_bit(0, >misc_opened); > + > + return 0; > +} > + > +static __poll_t thermal_chdev_poll(struct file *file, struct > poll_table_struct *wait) > +{ > + struct thermal_chdev *chdev; > + __poll_t mask = 0; > + > + chdev = container_of(file->private_data, struct thermal_chdev, > therm_dev); > + > +
[PATCH] m68k/mac: Don't call via_flush_cache() on Mac IIfx
There is no VIA2 chip on the Mac IIfx, so don't call via_flush_cache(). This avoids a boot crash which appeared in v5.4. printk: console [ttyS0] enabled printk: bootconsole [debug0] disabled printk: bootconsole [debug0] disabled Calibrating delay loop... 9.61 BogoMIPS (lpj=48064) pid_max: default: 32768 minimum: 301 Mount-cache hash table entries: 1024 (order: 0, 4096 bytes, linear) Mountpoint-cache hash table entries: 1024 (order: 0, 4096 bytes, linear) devtmpfs: initialized random: get_random_u32 called from bucket_table_alloc.isra.27+0x68/0x194 with crng_init=0 clocksource: jiffies: mask: 0x max_cycles: 0x, max_idle_ns: 1911260446275 ns futex hash table entries: 256 (order: -1, 3072 bytes, linear) NET: Registered protocol family 16 Data read fault at 0x in Super Data (pc=0x8a6a) BAD KERNEL BUSERR Oops: Modules linked in: PC: [<8a6a>] via_flush_cache+0x12/0x2c SR: 2700 SP: 01c1fe3c a2: 01c24000 d0: 1119d1: 000cd2: 00012000d3: 000f d4: 01c06840d5: 00033b92a0: a1: Process swapper (pid: 1, task=01c24000) Frame format=B ssw=0755 isc=0200 isb=fff7 daddr= dobuf=01c1fed0 baddr=8a6e dibuf=004e ver=f Stack from 01c1fec4: 01c1fed0 7d7e 00010080 01c1fedc 792e 0001 01c1fef4 6b40 01c8 0004 0006 0003 01c1ff1c 004a545e 004ff200 0004 0003 01c06840 00033b92 004a5410 004b6c88 01c1ff84 21e2 0073 0003 01c06840 00033b92 0038507a 004bb094 004b6ca8 004b6c88 004b6ca4 004b6c88 21ae 00020002 01c0685d 01c1ffb4 0049f938 00409c85 01c06840 0045bd40 0073 0002 0002 Call Trace: [<7d7e>] mac_cache_card_flush+0x12/0x1c [<00010080>] fix_dnrm+0x2/0x18 [<792e>] cache_push+0x46/0x5a [<6b40>] arch_dma_prep_coherent+0x60/0x6e [<0004>] switched_to_dl+0x76/0xd0 [<004a545e>] dma_atomic_pool_init+0x4e/0x188 [<0004>] switched_to_dl+0x76/0xd0 [<00033b92>] parse_args+0x0/0x370 [<004a5410>] dma_atomic_pool_init+0x0/0x188 [<21e2>] do_one_initcall+0x34/0x1be [<00033b92>] parse_args+0x0/0x370 [<0038507a>] strcpy+0x0/0x1e [<21ae>] do_one_initcall+0x0/0x1be [<00020002>] do_proc_dointvec_conv+0x54/0x74 [<0049f938>] 
kernel_init_freeable+0x126/0x190 [<0049f94c>] kernel_init_freeable+0x13a/0x190 [<004a5410>] dma_atomic_pool_init+0x0/0x188 [<00041798>] complete+0x0/0x3c [<000b9b0c>] kfree+0x0/0x20a [<0038df98>] schedule+0x0/0xd0 [<0038d604>] kernel_init+0x0/0xda [<0038d610>] kernel_init+0xc/0xda [<0038d604>] kernel_init+0x0/0xda [<2d38>] ret_from_kernel_thread+0xc/0x14 Code: 2079 0048 10da 2279 0048 10c8 d3c8 <1011> 0200 fff7 1280 d1f9 0048 10c8 1010 0008 1080 4e5e 4e75 4e56 2039 Disabling lock debugging due to kernel taint Kernel panic - not syncing: Attempted to kill init! exitcode=0x000b Thanks to Stan Johnson for capturing the console log and running git bisect. Git bisect said commit 8e3a68fb55e0 ("dma-mapping: make dma_atomic_pool_init self-contained") is the first "bad" commit. I don't know why. Perhaps mach_l2_flush first became reachable with that commit. Cc: Joshua Thompson Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/m68k/include/asm/mac_via.h | 1 + arch/m68k/mac/config.c | 21 ++--- arch/m68k/mac/via.c | 6 +- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h index de1470c4d829..1149251ea58d 100644 --- a/arch/m68k/include/asm/mac_via.h +++ b/arch/m68k/include/asm/mac_via.h @@ -257,6 +257,7 @@ extern int rbv_present,via_alt_mapping; struct irq_desc; +extern void via_l2_flush(int writeback); extern void via_register_interrupts(void); extern void via_irq_enable(int); extern void via_irq_disable(int); diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 611f73bfc87c..d0126ab01360 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -59,7 +59,6 @@ extern void iop_preinit(void); extern void iop_init(void); extern void via_init(void); extern void via_init_clock(irq_handler_t func); -extern void via_flush_cache(void); extern void oss_init(void); extern void psc_init(void); extern void baboon_init(void); 
@@ -130,21 +129,6 @@ int __init mac_parse_bootinfo(const struct bi_record *record) return unknown; } -/* - * Flip into 24bit mode for an instant - flushes the L2 cache card. We - * have to disable interrupts for this. Our IRQ handlers will crap - * themselves if they take an IRQ in 24bit mode! - */ - -static void mac_cache_card_flush(int writeback) -{ - unsigned long flags; - - local_irq_save(flags); - via_flush_cache(); - local_irq_restore(flags); -} - void __init config_mac(void) { if (!MACH_IS_MAC) @@ -175,9 +159,8 @@ void __init
Re: [RFC PATCH 2/2] init: Allow multi-line output of kernel command line
On (20/05/19 12:42), Joe Perches wrote: > +static void __init print_cmdline(char *line) > +{ > +#ifdef CONFIG_PRINTK > + const char *prefix = "Kernel command line"; > + size_t len = strlen(line); > + > + while (len > PRINTK_LOG_LINE_MAX) { > + char *pos = line; > + char *last_pos = pos + PRINTK_LOG_LINE_MAX - 1; > + char saved_char; > + /* Find last space char within the maximum line length */ > + while ((pos = memchr(pos, ' ', len - (pos - line))) && > +(pos - line) < PRINTK_LOG_LINE_MAX - 1) { Don't you need to also count in the 'prefix' length? > + last_pos = pos; > + } > + saved_char = line[last_pos - line]; > + line[last_pos - line] = 0; > + pr_notice("%s: %s\n", prefix, line); > + prefix = "Kernel command line (continued)"; > + line[last_pos - line] = saved_char; > + len -= pos - line; > + line += pos - line; > + } > + > + pr_notice("%s: %s\n", prefix, line); > +#endif > +} I like this in general. And I agree that we better handle this externally, on the printk() caller side, so that printk() will still have sane limits and won't print a 1G string for example. I wonder if we need to export PRINTK_LOG_LINE_MAX. Maybe we can use here something rather random and much shorter instead. E.g. 256 chars. Hmm. How many crash/monitoring tools can get confused by multiple "Kernel command line" prefixes? -ss
[PATCH] m68k/mac: Remove misleading comment
This code path was tested on a Quadra 950 a long time ago and the comment isn't needed. Cc: Joshua Thompson Signed-off-by: Finn Thain --- arch/m68k/mac/iop.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 100e5112fd9e..d99c7ea08d8c 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -299,7 +299,6 @@ void __init iop_init(void) /* * Register the interrupt handler for the IOPs. - * TODO: might be wrong for non-OSS machines. Anyone? */ void __init iop_register_interrupts(void) -- 2.26.2
[PATCH] m68k/mac: Avoid stuck ISM IOP interrupt on Quadra 900/950
On a Quadra 900/950, the ISM IOP IRQ output pin is connected to an edge-triggered input on VIA2. It is theoretically possible that this signal could fail to produce the expected VIA2 interrupt. The two IOP interrupt flags can be asserted in any order but the logic in iop_ism_irq() does not allow for that. In particular, INT0 can be asserted right after INT0 is checked and before INT1 is cleared. Such an interrupt would produce no new edge and VIA2 would detect no further interrupts from the IOP. Avoid this by looping over the INT0/1 handlers so an edge can be produced. Cc: Joshua Thompson Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/m68k/mac/iop.c | 50 + 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 9bfa17015768..100e5112fd9e 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -566,36 +566,42 @@ irqreturn_t iop_ism_irq(int irq, void *dev_id) uint iop_num = (uint) dev_id; volatile struct mac_iop *iop = iop_base[iop_num]; int i,state; + u8 events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); iop_pr_debug("status %02X\n", iop->status_ctrl); - /* INT0 indicates a state change on an outgoing message channel */ - - if (iop->status_ctrl & IOP_INT0) { - iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, send states", iop->status_ctrl); - for (i = 0 ; i < NUM_IOP_CHAN ; i++) { - state = iop_readb(iop, IOP_ADDR_SEND_STATE + i); - iop_pr_cont(" %02X", state); - if (state == IOP_MSG_COMPLETE) { - iop_handle_send(iop_num, i); + do { + /* INT0 indicates state change on an outgoing message channel */ + if (events & IOP_INT0) { + iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC; + iop_pr_debug("new status %02X, send states", +iop->status_ctrl); + for (i = 0; i < NUM_IOP_CHAN; i++) { + state = iop_readb(iop, IOP_ADDR_SEND_STATE + i); + iop_pr_cont(" %02X", state); + if (state == IOP_MSG_COMPLETE) + iop_handle_send(iop_num, i); } + iop_pr_cont("\n"); } - 
iop_pr_cont("\n"); - } - if (iop->status_ctrl & IOP_INT1) { /* INT1 for incoming msgs */ - iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, recv states", iop->status_ctrl); - for (i = 0 ; i < NUM_IOP_CHAN ; i++) { - state = iop_readb(iop, IOP_ADDR_RECV_STATE + i); - iop_pr_cont(" %02X", state); - if (state == IOP_MSG_NEW) { - iop_handle_recv(iop_num, i); + /* INT1 for incoming messages */ + if (events & IOP_INT1) { + iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC; + iop_pr_debug("new status %02X, recv states", +iop->status_ctrl); + for (i = 0; i < NUM_IOP_CHAN; i++) { + state = iop_readb(iop, IOP_ADDR_RECV_STATE + i); + iop_pr_cont(" %02X", state); + if (state == IOP_MSG_NEW) + iop_handle_recv(iop_num, i); } + iop_pr_cont("\n"); } - iop_pr_cont("\n"); - } + + events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); + } while (events); + return IRQ_HANDLED; } -- 2.26.2
Re: [RFC][PATCH 4/5] thermal: Add support for setting polling interval
On Mon, May 4, 2020 at 11:47 PM Srinivas Pandruvada wrote: > > Add new attribute in the thermal sysfs for setting temperature sampling > interval when CONFIG_THERMAL_USER_EVENT_INTERFACE is defined. The default > value is 0, which means no polling. > > At this interval user space will get an event THERMAL_TEMP_SAMPLE with > temperature sample. This reuses existing polling mechanism when polling > or passive delay is specified during zone registry. To avoid interference > with passive and polling delay, this new polling attribute can't be used > for those zones. Why should the kernel periodically emit events for userspace when the userspace is perfectly capable of deciding how frequently it wants to poll a file for changes? > > Signed-off-by: Srinivas Pandruvada > --- > drivers/thermal/thermal_core.c | 7 +++ > drivers/thermal/thermal_sysfs.c | 36 +++-- > include/linux/thermal.h | 1 + > 3 files changed, 42 insertions(+), 2 deletions(-) > > diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c > index 14770d882d42..17cd799b0073 100644 > --- a/drivers/thermal/thermal_core.c > +++ b/drivers/thermal/thermal_core.c > @@ -313,6 +313,8 @@ static void monitor_thermal_zone(struct > thermal_zone_device *tz) > thermal_zone_device_set_polling(tz, tz->passive_delay); > else if (tz->polling_delay) > thermal_zone_device_set_polling(tz, tz->polling_delay); > + else if (tz->temp_polling_delay) > + thermal_zone_device_set_polling(tz, tz->temp_polling_delay); > else > thermal_zone_device_set_polling(tz, 0); > > @@ -446,6 +448,11 @@ static void update_temperature(struct > thermal_zone_device *tz) > tz->temperature = temp; > mutex_unlock(>lock); > > + if (tz->temp_polling_delay) { > + thermal_dev_send_event(tz->id, THERMAL_TEMP_SAMPLE, temp); > + monitor_thermal_zone(tz); > + } > + > trace_thermal_temperature(tz); > if (tz->last_temperature == THERMAL_TEMP_INVALID) > dev_dbg(>device, "last_temperature N/A, > current_temperature=%d\n", > diff --git 
a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c > index aa85424c3ac4..0df7997993fe 100644 > --- a/drivers/thermal/thermal_sysfs.c > +++ b/drivers/thermal/thermal_sysfs.c > @@ -248,6 +248,36 @@ create_thres_attr(temp_thres_low); > create_thres_attr(temp_thres_high); > create_thres_attr(temp_thres_hyst); > > +static ssize_t > +temp_polling_delay_store(struct device *dev, struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + struct thermal_zone_device *tz = to_thermal_zone(dev); > + int val; > + > + if (kstrtoint(buf, 10, )) > + return -EINVAL; > + > + if (val && val < 1000) > + return -EINVAL; > + > + tz->temp_polling_delay = val; > + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); > + > + return count; > +} > + > +static ssize_t > +temp_polling_delay_show(struct device *dev, struct device_attribute *attr, > +char *buf) > +{ > + struct thermal_zone_device *tz = to_thermal_zone(dev); > + > + return sprintf(buf, "%d\n", tz->temp_polling_delay); > +} > + > +static DEVICE_ATTR_RW(temp_polling_delay); > + > static int create_user_events_attrs(struct thermal_zone_device *tz) > { > struct attribute **attrs; > @@ -260,8 +290,8 @@ static int create_user_events_attrs(struct > thermal_zone_device *tz) > if (tz->ops->get_temp_thres_high) > ++index; > > - /* One additional space for NULL */ > - attrs = kcalloc(index + 1, sizeof(*attrs), GFP_KERNEL); > + /* One additional space for NULL and temp_pollling_delay */ > + attrs = kcalloc(index + 2, sizeof(*attrs), GFP_KERNEL); > if (!attrs) > return -ENOMEM; > > @@ -312,6 +342,8 @@ static int create_user_events_attrs(struct > thermal_zone_device *tz) > attrs[index] = >threshold_attrs[index].attr.attr; > ++index; > } > + if (!tz->polling_delay && !tz->passive_delay) > + attrs[index++] = _attr_temp_polling_delay.attr; > attrs[index] = NULL; > tz->threshold_attribute_group.attrs = attrs; > > diff --git a/include/linux/thermal.h b/include/linux/thermal.h > index ee9d79ace7ce..0ec4bd8c9c5c 
100644 > --- a/include/linux/thermal.h > +++ b/include/linux/thermal.h > @@ -216,6 +216,7 @@ struct thermal_zone_device { > enum thermal_notify_event notify_event; > struct attribute_group threshold_attribute_group; > struct thermal_attr *threshold_attrs; > + int temp_polling_delay; > }; > > /** > -- > 2.25.4 >
Re: [PATCHv2 4/5] Input: EXC3000: Add support to query model and fw_version
Hi Sebastian, I love your patch! Yet something to improve: [auto build test ERROR on input/next] [also build test ERROR on v5.7-rc6 next-20200519] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system. BTW, we also suggest to use '--base' option to specify the base tree in git format-patch, please see https://stackoverflow.com/a/37406982] url: https://github.com/0day-ci/linux/commits/Sebastian-Reichel/EXC3000-Updates/20200520-023207 base: https://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git next config: ia64-randconfig-r023-20200519 (attached as .config) compiler: ia64-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=ia64 If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot All error/warnings (new ones prefixed by >>, old ones prefixed by <<): In file included from include/linux/kobject.h:20, from include/linux/device.h:17, from drivers/input/touchscreen/exc3000.c:11: >> drivers/input/touchscreen/exc3000.c:252:23: error: initialization of >> 'ssize_t (*)(struct device *, struct device_attribute *, char *)' {aka 'long >> int (*)(struct device *, struct device_attribute *, char *)'} from >> incompatible pointer type 'int (*)(struct device *, struct device_attribute >> *, char *)' [-Werror=incompatible-pointer-types] 252 | static DEVICE_ATTR_RO(fw_version); | ^~ include/linux/sysfs.h:117:10: note: in definition of macro '__ATTR_RO' 117 | .show = _name##_show, | ^ >> drivers/input/touchscreen/exc3000.c:252:8: note: in expansion of macro >> 'DEVICE_ATTR_RO' 252 | static DEVICE_ATTR_RO(fw_version); |^~ drivers/input/touchscreen/exc3000.c:252:23: note: (near initialization for 'dev_attr_fw_version.show') 252 | static DEVICE_ATTR_RO(fw_version); | ^~ 
include/linux/sysfs.h:117:10: note: in definition of macro '__ATTR_RO' 117 | .show = _name##_show, | ^ >> drivers/input/touchscreen/exc3000.c:252:8: note: in expansion of macro >> 'DEVICE_ATTR_RO' 252 | static DEVICE_ATTR_RO(fw_version); |^~ cc1: some warnings being treated as errors vim +252 drivers/input/touchscreen/exc3000.c 222 223 static int fw_version_show(struct device *dev, 224 struct device_attribute *attr, char *buf) 225 { 226 struct exc3000_data *data = dev_get_drvdata(dev); 227 static const u8 request[68] = { 228 0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'D', 0x00 229 }; 230 struct i2c_client *client = data->client; 231 int err; 232 233 mutex_lock(>query_lock); 234 235 data->query_result = -ETIMEDOUT; 236 reinit_completion(>wait_event); 237 238 err = i2c_master_send(client, request, sizeof(request)); 239 if (err < 0) { 240 mutex_unlock(>query_lock); 241 return err; 242 } 243 244 wait_for_completion_interruptible_timeout(>wait_event, 1*HZ); 245 mutex_unlock(>query_lock); 246 247 if (data->query_result < 0) 248 return data->query_result; 249 250 return sprintf(buf, "%s\n", data->fw_version); 251 } > 252 static DEVICE_ATTR_RO(fw_version); 253 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH v1] driver core: Fix handling of SYNC_STATE_ONLY + STATELESS device links
On Tue, May 19, 2020 at 9:36 PM Saravana Kannan wrote: > > Commit 21c27f06587d ("driver core: Fix SYNC_STATE_ONLY device link > implementation") didn't completely fix STATELESS + SYNC_STATE_ONLY > handling. > > What looks like an optimization in that commit is actually a bug that > causes an if condition to always take the else path. This prevents > reordering of devices in the dpm_list when a DL_FLAG_STATELESS device > link is created on top of an existing DL_FLAG_SYNC_STATE_ONLY device > link. > > Fixes: 21c27f06587d ("driver core: Fix SYNC_STATE_ONLY device link > implementation") > Signed-off-by: Saravana Kannan > --- > Sigh... device links are tricky and hard! Sorry about the endless fixes :( > Also, how was this not caught by the compiler as a warning? > > -Saravana > > drivers/base/core.c | 8 +--- > 1 file changed, 5 insertions(+), 3 deletions(-) > > diff --git a/drivers/base/core.c b/drivers/base/core.c > index 83a3e0b62ce3..dfd4e94ef790 100644 > --- a/drivers/base/core.c > +++ b/drivers/base/core.c > @@ -543,12 +543,14 @@ struct device_link *device_link_add(struct device > *consumer, > > if (flags & DL_FLAG_STATELESS) { > kref_get(>kref); > - link->flags |= DL_FLAG_STATELESS; > if (link->flags & DL_FLAG_SYNC_STATE_ONLY && > - !(link->flags & DL_FLAG_STATELESS)) > + !(link->flags & DL_FLAG_STATELESS)) { > + link->flags |= DL_FLAG_STATELESS; > goto reorder; > - else > + } else { > + link->flags |= DL_FLAG_STATELESS; > goto out; > + } > } > > /* Forgot to add sta...@vger.kernel.org. Doing that now. -Saravana
Re: [PATCH v1 01/25] net: core: device_rename: Use rwsem instead of a seqcount
On Tue, 19 May 2020 20:18:19 -0700 Eric Dumazet wrote: > On 5/19/20 7:57 PM, David Miller wrote: > > From: Thomas Gleixner > > Date: Wed, 20 May 2020 01:42:30 +0200 > > > >> Stephen Hemminger writes: > >>> On Wed, 20 May 2020 00:23:48 +0200 > >>> Thomas Gleixner wrote: > No. We did not. -ENOTESTCASE > >>> > >>> Please try, it isn't that hard.. > >>> > >>> # time for ((i=0;i<1000;i++)); do ip li add dev dummy$i type dummy; done > >>> > >>> real 0m17.002s > >>> user 0m1.064s > >>> sys 0m0.375s > >> > >> And that solves the incorrectness of the current code in which way? > > > > You mentioned that there wasn't a test case, he gave you one to try. > > > > I do not think this would ever use device rename, nor netdev_get_name() > > None of this stuff is fast path really. > > # time for ((i=1;i<1000;i++)); do ip li add dev dummy$i type dummy; done > > real 0m1.127s > user 0m0.270s > sys 0m1.039s You're right, it is a weak test, and most of the overhead is in the syscall and all netlink events that happen. It does end up looking up the new name, so would exercise that. A better test is to use %d syntax or create 1000 dummy's then rename every one. This is more of a stress test # for ((i=0;i<1000;i++)); do echo link add dev dummy%d type dummy; done | time ip -batch - 0.00user 0.29system 0:02.11elapsed 13%CPU (0avgtext+0avgdata 2544maxresident)k 0inputs+0outputs (0major+148minor)pagefaults 0swaps # for ((i=999;i>=0;i--)); do echo link set dummy$i name dummy$((i+1)); done | time ip -batch - 0.00user 0.26system 0:54.98elapsed 0%CPU (0avgtext+0avgdata 2508maxresident)k 0inputs+0outputs (0major+145minor)pagefaults 0swaps
[PATCH v1] driver core: Fix handling of SYNC_STATE_ONLY + STATELESS device links
Commit 21c27f06587d ("driver core: Fix SYNC_STATE_ONLY device link implementation") didn't completely fix STATELESS + SYNC_STATE_ONLY handling. What looks like an optimization in that commit is actually a bug that causes an if condition to always take the else path. This prevents reordering of devices in the dpm_list when a DL_FLAG_STATELESS device link is created on top of an existing DL_FLAG_SYNC_STATE_ONLY device link. Fixes: 21c27f06587d ("driver core: Fix SYNC_STATE_ONLY device link implementation") Signed-off-by: Saravana Kannan --- Sigh... device links are tricky and hard! Sorry about the endless fixes :( Also, how was this not caught by the compiler as a warning? -Saravana drivers/base/core.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 83a3e0b62ce3..dfd4e94ef790 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -543,12 +543,14 @@ struct device_link *device_link_add(struct device *consumer, if (flags & DL_FLAG_STATELESS) { kref_get(>kref); - link->flags |= DL_FLAG_STATELESS; if (link->flags & DL_FLAG_SYNC_STATE_ONLY && - !(link->flags & DL_FLAG_STATELESS)) + !(link->flags & DL_FLAG_STATELESS)) { + link->flags |= DL_FLAG_STATELESS; goto reorder; - else + } else { + link->flags |= DL_FLAG_STATELESS; goto out; + } } /* -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v6 1/5] perf stat: Fix wrong per-thread runtime stat for interval mode
root@kbl-ppc:~# perf stat --per-thread -e cycles,instructions -I1000 --interval-count 2 1.004171683 perf-3696 8,747,311 cycles ... 1.004171683 perf-3696 691,730 instructions #0.08 insn per cycle ... 2.006490373 perf-3696 1,749,936 cycles ... 2.006490373 perf-3696 1,484,582 instructions #0.28 insn per cycle ... Let's see interval 2.006490373 perf-3696 1,749,936 cycles perf-3696 1,484,582 instructions #0.28 insn per cycle insn per cycle = 1,484,582 / 1,749,936 = 0.85. But now it's 0.28, that's not correct. stat_config.stats[] records the per-thread runtime stat. But for interval mode, it should be reset for each interval. So now, with this patch, root@kbl-ppc:~# perf stat --per-thread -e cycles,instructions -I1000 --interval-count 2 1.005818121 perf-8633 9,898,045 cycles ... 1.005818121 perf-8633 693,298 instructions #0.07 insn per cycle ... 2.007863743 perf-8633 1,551,619 cycles ... 2.007863743 perf-8633 1,317,514 instructions #0.85 insn per cycle ... Let's check interval 2.007863743. insn per cycle = 1,317,514 / 1,551,619 = 0.85. It's correct. This patch creates runtime_stat_reset, places it next to runtime_stat_new/runtime_stat_delete and moves all runtime_stat functions before process_interval. v4: --- Create runtime_stat_reset. 
Fixes: commit 14e72a21c783 ("perf stat: Update or print per-thread stats") Signed-off-by: Jin Yao --- tools/perf/builtin-stat.c | 70 +++ 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e0c1ad23c768..f3b3a59ac7d2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -351,6 +351,46 @@ static void read_counters(struct timespec *rs) } } +static int runtime_stat_new(struct perf_stat_config *config, int nthreads) +{ + int i; + + config->stats = calloc(nthreads, sizeof(struct runtime_stat)); + if (!config->stats) + return -1; + + config->stats_num = nthreads; + + for (i = 0; i < nthreads; i++) + runtime_stat__init(>stats[i]); + + return 0; +} + +static void runtime_stat_delete(struct perf_stat_config *config) +{ + int i; + + if (!config->stats) + return; + + for (i = 0; i < config->stats_num; i++) + runtime_stat__exit(>stats[i]); + + zfree(>stats); +} + +static void runtime_stat_reset(struct perf_stat_config *config) +{ + int i; + + if (!config->stats) + return; + + for (i = 0; i < config->stats_num; i++) + perf_stat__reset_shadow_per_stat(>stats[i]); +} + static void process_interval(void) { struct timespec ts, rs; @@ -359,6 +399,7 @@ static void process_interval(void) diff_timespec(, , _time); perf_stat__reset_shadow_per_stat(_stat); + runtime_stat_reset(_config); read_counters(); if (STAT_RECORD) { @@ -1737,35 +1778,6 @@ int process_cpu_map_event(struct perf_session *session, return set_maps(st); } -static int runtime_stat_new(struct perf_stat_config *config, int nthreads) -{ - int i; - - config->stats = calloc(nthreads, sizeof(struct runtime_stat)); - if (!config->stats) - return -1; - - config->stats_num = nthreads; - - for (i = 0; i < nthreads; i++) - runtime_stat__init(>stats[i]); - - return 0; -} - -static void runtime_stat_delete(struct perf_stat_config *config) -{ - int i; - - if (!config->stats) - return; - - for (i = 0; i < config->stats_num; i++) - 
runtime_stat__exit(>stats[i]); - - zfree(>stats); -} - static const char * const stat_report_usage[] = { "perf stat report []", NULL, -- 2.17.1
[PATCH v6 0/5] perf stat: Support overall statistics for interval mode
Currently perf-stat supports to print counts at regular interval (-I), but it's not very easy for user to get the overall statistics. With this patchset, it supports to report the summary at the end of interval output. For example, root@kbl-ppc:~# perf stat -e cycles -I1000 --interval-count 2 # time counts unit events 1.000412064 2,281,114 cycles 2.001383658 2,547,880 cycles Performance counter stats for 'system wide': 4,828,994 cycles 2.002860349 seconds time elapsed root@kbl-ppc:~# perf stat -e cycles,instructions -I1000 --interval-count 2 # time counts unit events 1.000389902 1,536,093 cycles 1.000389902420,226 instructions #0.27 insn per cycle 2.001433453 2,213,952 cycles 2.001433453735,465 instructions #0.33 insn per cycle Performance counter stats for 'system wide': 3,750,045 cycles 1,155,691 instructions #0.31 insn per cycle 2.003023361 seconds time elapsed root@kbl-ppc:~# perf stat -M CPI,IPC -I1000 --interval-count 2 # time counts unit events 1.000435121905,303 inst_retired.any # 2.9 CPI 1.000435121 2,663,333 cycles 1.000435121914,702 inst_retired.any # 0.3 IPC 1.000435121 2,676,559 cpu_clk_unhalted.thread 2.001615941 1,951,092 inst_retired.any # 1.8 CPI 2.001615941 3,551,357 cycles 2.001615941 1,950,837 inst_retired.any # 0.5 IPC 2.001615941 3,551,044 cpu_clk_unhalted.thread Performance counter stats for 'system wide': 2,856,395 inst_retired.any # 2.2 CPI 6,214,690 cycles 2,865,539 inst_retired.any # 0.5 IPC 6,227,603 cpu_clk_unhalted.thread 2.003403078 seconds time elapsed v6: --- 1. Add comments in perf_evlist__save_aggr_prev_raw_counts. 2. Move init_stats(_nsecs_stats) under interval condition check. Following patches are changed in v6. perf stat: Save aggr value to first member of prev_raw_counts perf stat: Report summary for interval mode v5: --- 1. Create new patch "perf stat: Save aggr value to first member of prev_raw_counts". 2. Call perf_evlist__save_aggr_prev_raw_counts to save aggr value to first member of prev_raw_counts for AGGR_GLOBAL. 
Then next, perf_stat_process_counter can create aggr values from per cpu values. Following patches are impacted in v5: perf stat: Copy counts from prev_raw_counts to evsel->counts perf stat: Save aggr value to first member of prev_raw_counts perf stat: Report summary for interval mode v4: --- 1. Create runtime_stat_reset. 2. Zero the aggr in perf_counts__reset and use it to reset prev_raw_counts. 3. Move affinity setup and read_counter_cpu to a new function read_affinity_counters. It's only called when stat_config.summary is not set. v3: --- 1. 'perf stat: Fix wrong per-thread runtime stat for interval mode' is a new patch which fixes an existing issue found in test. 2. We use the prev_raw_counts for summary counts. Drop the summary_counts in v2. 3. Fix some issues. v2: --- Rebase to perf/core branch Jin Yao (5): perf stat: Fix wrong per-thread runtime stat for interval mode perf counts: Reset prev_raw_counts counts perf stat: Copy counts from prev_raw_counts to evsel->counts perf stat: Save aggr value to first member of prev_raw_counts perf stat: Report summary for interval mode tools/perf/builtin-stat.c | 101 +- tools/perf/util/counts.c | 4 +- tools/perf/util/counts.h | 1 + tools/perf/util/stat.c| 53 +--- tools/perf/util/stat.h| 3 ++ 5 files changed, 122 insertions(+), 40 deletions(-) -- 2.17.1
[PATCH v6 5/5] perf stat: Report summary for interval mode
Currently perf-stat supports to print counts at regular interval (-I), but it's not very easy for user to get the overall statistics. The patch uses 'evsel->prev_raw_counts' to get counts for summary. Copy the counts to 'evsel->counts' after printing the interval results. Next, we just follow the non-interval processing. Let's see some examples, root@kbl-ppc:~# perf stat -e cycles -I1000 --interval-count 2 # time counts unit events 1.000412064 2,281,114 cycles 2.001383658 2,547,880 cycles Performance counter stats for 'system wide': 4,828,994 cycles 2.002860349 seconds time elapsed root@kbl-ppc:~# perf stat -e cycles,instructions -I1000 --interval-count 2 # time counts unit events 1.000389902 1,536,093 cycles 1.000389902420,226 instructions #0.27 insn per cycle 2.001433453 2,213,952 cycles 2.001433453735,465 instructions #0.33 insn per cycle Performance counter stats for 'system wide': 3,750,045 cycles 1,155,691 instructions #0.31 insn per cycle 2.003023361 seconds time elapsed root@kbl-ppc:~# perf stat -M CPI,IPC -I1000 --interval-count 2 # time counts unit events 1.000435121905,303 inst_retired.any # 2.9 CPI 1.000435121 2,663,333 cycles 1.000435121914,702 inst_retired.any # 0.3 IPC 1.000435121 2,676,559 cpu_clk_unhalted.thread 2.001615941 1,951,092 inst_retired.any # 1.8 CPI 2.001615941 3,551,357 cycles 2.001615941 1,950,837 inst_retired.any # 0.5 IPC 2.001615941 3,551,044 cpu_clk_unhalted.thread Performance counter stats for 'system wide': 2,856,395 inst_retired.any # 2.2 CPI 6,214,690 cycles 2,865,539 inst_retired.any # 0.5 IPC 6,227,603 cpu_clk_unhalted.thread 2.003403078 seconds time elapsed v6: --- Move init_stats(_nsecs_stats) under interval condition check. walltime_nsecs_stats.val holds the last value so we just need to init the other fields of stats. v5: --- Call perf_evlist__save_aggr_prev_raw_counts to save aggr value to first member of prev_raw_counts for AGGR_GLOBAL. Then next, perf_stat_process_counter can create aggr values from per cpu values. 
v4: --- Move affinity setup and read_counter_cpu to a new function read_affinity_counters. It's only called when stat_config.summary is not set. v3: --- Use evsel->prev_raw_counts for summary counts v2: --- Rebase to perf/core branch Signed-off-by: Jin Yao --- tools/perf/builtin-stat.c | 31 +++ tools/perf/util/stat.c| 2 +- tools/perf/util/stat.h| 1 + 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f3b3a59ac7d2..2486c79f0f34 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -314,14 +314,14 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu) return 0; } -static void read_counters(struct timespec *rs) +static int read_affinity_counters(struct timespec *rs) { struct evsel *counter; struct affinity affinity; int i, ncpus, cpu; if (affinity__setup() < 0) - return; + return -1; ncpus = perf_cpu_map__nr(evsel_list->core.all_cpus); if (!target__has_cpu() || target__has_per_thread()) @@ -341,6 +341,15 @@ static void read_counters(struct timespec *rs) } } affinity__cleanup(); + return 0; +} + +static void read_counters(struct timespec *rs) +{ + struct evsel *counter; + + if (!stat_config.summary && (read_affinity_counters(rs) < 0)) + return; evlist__for_each_entry(evsel_list, counter) { if (counter->err) @@ -763,7 +772,21 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (stat_config.walltime_run_table) stat_config.walltime_run[run_idx] = t1 - t0; - update_stats(_nsecs_stats, t1 - t0); + if (interval) { + stat_config.interval = 0; + stat_config.summary = true; + init_stats(_nsecs_stats); + update_stats(_nsecs_stats, t1 - t0); + + if (stat_config.aggr_mode == AGGR_GLOBAL) + perf_evlist__save_aggr_prev_raw_counts(evsel_list); + + perf_evlist__copy_prev_raw_counts(evsel_list); + perf_evlist__reset_prev_raw_counts(evsel_list); +
[PATCH v6 4/5] perf stat: Save aggr value to first member of prev_raw_counts
To collect the overall statistics for interval mode, we copy the counts from evsel->prev_raw_counts to evsel->counts. For AGGR_GLOBAL mode, because the perf_stat_process_counter creates aggr values from per cpu values, but the per cpu values are 0, so the calculated aggr values will be always 0. This patch uses a trick that saves the previous aggr value to the first member of perf_counts, then aggr calculation in process_counter_values can work correctly for AGGR_GLOBAL. v6: --- Add comments in perf_evlist__save_aggr_prev_raw_counts. Signed-off-by: Jin Yao --- tools/perf/util/stat.c | 20 tools/perf/util/stat.h | 1 + 2 files changed, 21 insertions(+) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index aadc723ce871..d23109c9bee9 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -249,6 +249,26 @@ void perf_evlist__copy_prev_raw_counts(struct evlist *evlist) perf_evsel__copy_prev_raw_counts(evsel); } +void perf_evlist__save_aggr_prev_raw_counts(struct evlist *evlist) +{ + struct evsel *evsel; + + /* +* To collect the overall statistics for interval mode, +* we copy the counts from evsel->prev_raw_counts to +* evsel->counts. The perf_stat_process_counter creates +* aggr values from per cpu values, but the per cpu values +* are 0 for AGGR_GLOBAL. So we use a trick that saves the +* previous aggr value to the first member of perf_counts, +* then aggr calculation in process_counter_values can work +* correctly. 
+*/ + evlist__for_each_entry(evlist, evsel) { + *perf_counts(evsel->prev_raw_counts, 0, 0) = + evsel->prev_raw_counts->aggr; + } +} + static void zero_per_pkg(struct evsel *counter) { if (counter->per_pkg_mask) diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 62cf72c71869..18ead55756cc 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -199,6 +199,7 @@ void perf_evlist__free_stats(struct evlist *evlist); void perf_evlist__reset_stats(struct evlist *evlist); void perf_evlist__reset_prev_raw_counts(struct evlist *evlist); void perf_evlist__copy_prev_raw_counts(struct evlist *evlist); +void perf_evlist__save_aggr_prev_raw_counts(struct evlist *evlist); int perf_stat_process_counter(struct perf_stat_config *config, struct evsel *counter); -- 2.17.1
[PATCH v6 3/5] perf stat: Copy counts from prev_raw_counts to evsel->counts
It would be useful to support the overall statistics for perf-stat interval mode. For example, report the summary at the end of "perf-stat -I" output. But since perf-stat can support many aggregation modes, such as --per-thread, --per-socket, -M and etc, we need a solution which doesn't bring much complexity. The idea is to use 'evsel->prev_raw_counts' which is updated in each interval and it's saved with the latest counts. Before reporting the summary, we copy the counts from evsel->prev_raw_counts to evsel->counts, and next we just follow non-interval processing. v5: --- Don't save the previous aggr value to the member of [cpu0,thread0] in perf_counts. Originally that was a trick because the perf_stat_process_counter would create aggr values from per cpu values. But we don't need to do that all the time. We will handle it in next patch. v4: --- Change the commit message. No functional change. Signed-off-by: Jin Yao --- tools/perf/util/stat.c | 24 tools/perf/util/stat.h | 1 + 2 files changed, 25 insertions(+) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index e397815f0dfb..aadc723ce871 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -225,6 +225,30 @@ void perf_evlist__reset_prev_raw_counts(struct evlist *evlist) evsel__reset_prev_raw_counts(evsel); } +static void perf_evsel__copy_prev_raw_counts(struct evsel *evsel) +{ + int ncpus = evsel__nr_cpus(evsel); + int nthreads = perf_thread_map__nr(evsel->core.threads); + + for (int thread = 0; thread < nthreads; thread++) { + for (int cpu = 0; cpu < ncpus; cpu++) { + *perf_counts(evsel->counts, cpu, thread) = + *perf_counts(evsel->prev_raw_counts, cpu, +thread); + } + } + + evsel->counts->aggr = evsel->prev_raw_counts->aggr; +} + +void perf_evlist__copy_prev_raw_counts(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) + perf_evsel__copy_prev_raw_counts(evsel); +} + static void zero_per_pkg(struct evsel *counter) { if (counter->per_pkg_mask) 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index b4fdfaa7f2c0..62cf72c71869 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -198,6 +198,7 @@ int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw); void perf_evlist__free_stats(struct evlist *evlist); void perf_evlist__reset_stats(struct evlist *evlist); void perf_evlist__reset_prev_raw_counts(struct evlist *evlist); +void perf_evlist__copy_prev_raw_counts(struct evlist *evlist); int perf_stat_process_counter(struct perf_stat_config *config, struct evsel *counter); -- 2.17.1
[PATCH v6 2/5] perf counts: Reset prev_raw_counts counts
When we want to reset the evsel->prev_raw_counts, zeroing the aggr is not enough, we need to reset the perf_counts too. The perf_counts__reset zeros the perf_counts, and it should zero the aggr too. This patch changes perf_counts__reset to non-static, and calls it in evsel__reset_prev_raw_counts to reset the prev_raw_counts. v4: --- Zeroing the aggr in perf_counts__reset and use it to reset prev_raw_counts. Signed-off-by: Jin Yao --- tools/perf/util/counts.c | 4 +++- tools/perf/util/counts.h | 1 + tools/perf/util/stat.c | 7 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c index 615c9f3e95cb..582f3aeaf5e4 100644 --- a/tools/perf/util/counts.c +++ b/tools/perf/util/counts.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include <string.h> #include "evsel.h" #include "counts.h" #include @@ -42,10 +43,11 @@ void perf_counts__delete(struct perf_counts *counts) } } -static void perf_counts__reset(struct perf_counts *counts) +void perf_counts__reset(struct perf_counts *counts) { xyarray__reset(counts->loaded); xyarray__reset(counts->values); + memset(&counts->aggr, 0, sizeof(struct perf_counts_values)); } void evsel__reset_counts(struct evsel *evsel) diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h index 8f556c6d98fa..7ff36bf6d644 100644 --- a/tools/perf/util/counts.h +++ b/tools/perf/util/counts.h @@ -37,6 +37,7 @@ perf_counts__set_loaded(struct perf_counts *counts, int cpu, int thread, bool lo struct perf_counts *perf_counts__new(int ncpus, int nthreads); void perf_counts__delete(struct perf_counts *counts); +void perf_counts__reset(struct perf_counts *counts); void evsel__reset_counts(struct evsel *evsel); int evsel__alloc_counts(struct evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index f4a44df9b221..e397815f0dfb 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -163,11 +163,8 @@ static void
evsel__free_prev_raw_counts(struct evsel *evsel) static void evsel__reset_prev_raw_counts(struct evsel *evsel) { - if (evsel->prev_raw_counts) { - evsel->prev_raw_counts->aggr.val = 0; - evsel->prev_raw_counts->aggr.ena = 0; - evsel->prev_raw_counts->aggr.run = 0; - } + if (evsel->prev_raw_counts) + perf_counts__reset(evsel->prev_raw_counts); } static int evsel__alloc_stats(struct evsel *evsel, bool alloc_raw) -- 2.17.1
Re: [RFC][PATCH 3/5] thermal: Add support for setting notification thresholds
On Tue, May 19, 2020 at 5:10 AM Srinivas Pandruvada wrote: > > On Mon, 2020-05-18 at 18:37 +0200, Daniel Lezcano wrote: > > On 04/05/2020 20:16, Srinivas Pandruvada wrote: > > > Add new attributes in thermal sysfs when a thermal driver provides > > > callbacks for them and CONFIG_THERMAL_USER_EVENT_INTERFACE is > > > defined. > > > > > > These attributes allow user space to stop polling for temperature. > > > > > > These attributes are: > > > - temp_thres_low: Specify a notification temperature for a low > > > temperature threshold event. > > > temp_thres_high: Specify a notification temperature for a high > > > temperature threshold event. > > > temp_thres_hyst: Specify a change in temperature to send > > > notification > > > again. > > > > > > This is implemented by adding additional sysfs attribute group. The > > > changes in this patch are trivial to add new attributes in thermal > > > sysfs as done for other attributes. > > > > Isn't it duplicate with the trip point? > A trip point is where an in-kernel governor takes some action. This is > not the same as a notification temperature. For example at trip point > configured by ACPI at 85C, the thermal governor may start aggressive > throttling. > But a user space can set a notification threshold at 80C and start some > active controls like activate some fan to reduce the impact of passive > control on performance. Then what is the use of thermal trip type "ACTIVE" ? > We need a way to distinguish between temperature notification threshold > and actual trip point. Changing a trip point means that user wants > kernel to throttle at temperature.
Re: [PATCH v1 4/4] of: platform: Batch fwnode parsing when adding all top level devices
Hi Saravana, On 19.05.2020 20:02, Saravana Kannan wrote: > On Tue, May 19, 2020 at 3:32 AM Marek Szyprowski > wrote: >> On 19.05.2020 09:11, Marek Szyprowski wrote: >>> On 19.05.2020 08:48, Saravana Kannan wrote: On Mon, May 18, 2020 at 11:25 PM Marek Szyprowski wrote: > On 15.05.2020 07:35, Saravana Kannan wrote: >> The fw_devlink_pause() and fw_devlink_resume() APIs allow batching the >> parsing of the device tree nodes when a lot of devices are added. This >> will significantly cut down parsing time (as much a 1 second on some >> systems). So, use them when adding devices for all the top level >> device >> tree nodes in a system. >> >> Signed-off-by: Saravana Kannan > This patch recently landed in linux-next 20200518. Sadly, it causes > regression on Samsung Exynos5433-based TM2e board: > > ... > > Both issues, the lack of DMA for SPI device and Synchronous abort in > I2S > probe are new after applying this patch. I'm trying to investigate > which > resources are missing and why. The latter issue means typically that > the > registers for the given device has been accessed without enabling the > needed clocks or power domains. Did you try this copy-pasta fix that I sent later? https://lore.kernel.org/lkml/20200517173453.157703-1-sarava...@google.com/ Not every system would need it (my test setup didn't), but it helps some cases. If that fix doesn't help, then some tips for debugging the failing drivers. What this pause/resume patch effectively (not explicitly) does is: 1. Doesn't immediately probe the devices as they are added in of_platform_default_populate_init() 2. Adds them in order to the deferred probe list. 3. Then kicks off deferred probe on them in the order they were added. These drivers are just not handling -EPROBE_DEFER correctly or assuming probe order and that's causing these issues. So, we can either fix that or you can try adding some code to flush the deferred probe workqueue at the end of fw_devlink_resume(). Let me know how it goes. 
>>> So far it looks like your patch revealed a hidden issue in exynos5433 >>> clocks configuration, because adding clk_ignore_unused parameter to >>> kernel command line fixes the boot. I'm still investigating it, so >>> probably you can ignore my regression report. I will let you know asap >>> I finish checking it. >>> >> Okay, I confirm that the issue is in the Exynos I2S driver and >> Exynos5433 clock provider. I've posted a quick workaround. I'm sorry for >> the noise, your patch is fine. > Thanks for debugging and finding the real issue. I tried finding your > patches, but couldn't. Can you point me to a lore.kernel.org link? I'm > just curious to see what the issue was. https://lore.kernel.org/linux-samsung-soc/f67db8c1-453b-4c70-67b9-59762ac34...@kernel.org/T/#t It looks like one more clock has to be enabled to properly read init configuration. So far it worked, because that device was probed much earlier, before the unused clocks are turned off. Your patch changed the probe order, so that device is probed later. > I'm guessing you didn't need to pick up this one? > https://lore.kernel.org/lkml/20200517173453.157703-1-sarava...@google.com/ Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland
Re: [PATCH] printk/kdb: Redirect printk messages into kdb in any context
On (20/05/18 11:21), Petr Mladek wrote: [..] > > > Is this guaranteed that we never execute this path from NMI? > > Good question! > > > Absolutely not. > > > > The execution context for kdb is pretty much unique... we are running a > > debug mode with all CPUs parked in a holding loop with interrupts > > disabled. One CPU is at an unknown exception state and the others are > > either handling an IRQ or NMI depending on architecture[1]. > > This is similar to the situation in panic() when other CPUs are > stopped. It is more safe when the CPUs are stopped using IRQ. > There is higher danger of a deadlock when NMI is used. > > bust_spinlock() is used in panic() to increase the chance to go over > the deadlock and actually see the messages. It is not enough when > more locks are used by the console (VT/TTY is good example). And > it is not guaranteed that the console will still work after > the hack is disabled by bust_spinlocks(0). Good point. It's not guaranteed to help, but bust_spinlocks() does help in general, many serial drivers do check oops_in_progress and use a deadlock safe approach when locking port lock. I don't see bust_spinlocks() being used in kdb, so it probably better start doing so (along with general for_each_console() loop improvements, like checking if console is enabled/available/etc). [..] > > > If so, can this please be added to the commit message? A more > > > detailed commit message will help a lot. > > What about? > > "KDB has to get messages on consoles even when the system is stopped. > It uses kdb_printf() internally and calls console drivers on its own. > > It uses a hack to reuse an existing code. It sets "kdb_trap_printk" > global variable to redirect even the normal printk() into the > kdb_printf() variant. > > The variable "kdb_trap_printk" is checked in printk_default() and > it is ignored when printk is redirected to printk_safe in NMI context. > Solve this by moving the check into printk_func(). 
> > It is obvious that it is not fully safe. But it does not make things > worse. The console drivers are already called in this context by > kdb_printf() direct calls." This looks more informative indeed. Thanks! -ss
Re: linux-next: manual merge of the rcu tree with the powerpc tree
Stephen Rothwell writes: > Hi all, > > Today's linux-next merge of the rcu tree got a conflict in: > > arch/powerpc/kernel/traps.c > > between commit: > > 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI accounting") > > from the powerpc tree and commit: > > 187416eeb388 ("hardirq/nmi: Allow nested nmi_enter()") > > from the rcu tree. > > I fixed it up (I used the powerpc tree version for now) and can carry the > fix as necessary. OK, I guess that works for now, we'll have to clean it up later once both trees are merged upstream. I created an issue to track it: https://github.com/linuxppc/issues/issues/298 cheers
[PATCH v3 1/2] arm64: dts: add qe node to ls1043ardb
From: Zhao Qiang Add qe node to fsl-ls1043a.dtsi and fsl-ls1043a-rdb.dts Signed-off-by: Zhao Qiang --- v2: - add commit msg and drop a new blank line v3: - Keep labeling node sort alphabetically - remove unused device_type - use GIC_SPI and IRQ_TYPE_LEVEL_HIGH - use "arm64: dts:" format for subject arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts | 16 ++ arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi| 65 +++ 2 files changed, 81 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts index dde50c8..44d9343 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts @@ -176,3 +176,19 @@ }; }; }; + +&uqe { + ucc_hdlc: ucc@2000 { + compatible = "fsl,ucc-hdlc"; + rx-clock-name = "clk8"; + tx-clock-name = "clk9"; + fsl,rx-sync-clock = "rsync_pin"; + fsl,tx-sync-clock = "tsync_pin"; + fsl,tx-timeslot-mask = <0xfffe>; + fsl,rx-timeslot-mask = <0xfffe>; + fsl,tdm-framer-type = "e1"; + fsl,tdm-id = <0>; + fsl,siram-entry-id = <0>; + fsl,tdm-interface; + }; +}; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index c084c7a4..3b641bd 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -525,6 +525,71 @@ #interrupt-cells = <2>; }; + uqe: uqe@240 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,qe", "simple-bus"; + ranges = <0x0 0x0 0x240 0x4>; + reg = <0x0 0x240 0x0 0x480>; + brg-frequency = <1>; + bus-frequency = <2>; + fsl,qe-num-riscs = <1>; + fsl,qe-num-snums = <28>; + + qeic: qeic@80 { + compatible = "fsl,qe-ic"; + reg = <0x80 0x80>; + #address-cells = <0>; + interrupt-controller; + #interrupt-cells = <1>; + interrupts = , +; + }; + + si1: si@700 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,ls1043-qe-si", + "fsl,t1040-qe-si"; + reg = <0x700 0x80>; + }; + + siram1: siram@1000 { +
#address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,ls1043-qe-siram", + "fsl,t1040-qe-siram"; + reg = <0x1000 0x800>; + }; + + ucc@2000 { + cell-index = <1>; + reg = <0x2000 0x200>; + interrupts = <32>; + interrupt-parent = <&qeic>; + }; + + ucc@2200 { + cell-index = <3>; + reg = <0x2200 0x200>; + interrupts = <34>; + interrupt-parent = <&qeic>; + }; + + muram@1 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,qe-muram", "fsl,cpm-muram"; + ranges = <0x0 0x1 0x6000>; + + data-only@0 { + compatible = "fsl,qe-muram-data", + "fsl,cpm-muram-data"; + reg = <0x0 0x6000>; + }; + }; + }; + lpuart0: serial@295 { compatible = "fsl,ls1021a-lpuart"; reg = <0x0 0x295 0x0 0x1000>; -- 2.7.4
[PATCH v3 2/2] arm64: dts: Add ds26522 node to dts to ls1043ardb
From: Zhao Qiang Add ds26522 node to fsl-ls1043a-rdb.dts Signed-off-by: Zhao Qiang --- v3: - use "arm64: dts:" format for subject arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts | 16 1 file changed, 16 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts index 44d9343..1cb265f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts @@ -94,6 +94,22 @@ reg = <0>; spi-max-frequency = <100>; /* input clock */ }; + + slic@2 { + compatible = "maxim,ds26522"; + reg = <2>; + spi-max-frequency = <200>; + fsl,spi-cs-sck-delay = <100>; + fsl,spi-sck-cs-delay = <50>; + }; + + slic@3 { + compatible = "maxim,ds26522"; + reg = <3>; + spi-max-frequency = <200>; + fsl,spi-cs-sck-delay = <100>; + fsl,spi-sck-cs-delay = <50>; + }; }; { -- 2.7.4
Re: [PATCH 2/2] Add a new sysctl knob: unprivileged_userfaultfd_user_mode_only
Hello Jonathan and everyone, On Thu, May 07, 2020 at 01:15:03PM -0600, Jonathan Corbet wrote: > On Wed, 6 May 2020 15:38:16 -0400 > Peter Xu wrote: > > > If this is going to be added... I am thinking whether it should be easier to > > add another value for unprivileged_userfaultfd, rather than a new sysctl. > > E.g.: > > > > "0": unprivileged userfaultfd forbidden > > "1": unprivileged userfaultfd allowed (both user/kernel faults) > > "2": unprivileged userfaultfd allowed (only user faults) > > > > Because after all unprivileged_userfaultfd_user_mode_only will be > > meaningless > > (iiuc) if unprivileged_userfaultfd=0. The default value will also be the > > same > > as before ("1") > It occurs to me to wonder whether this interface should also let an admin > block *privileged* user from handling kernel-space faults? In a > secure-boot/lockdown setting, this could be a hardening measure that keeps > a (somewhat) restricted root user from expanding their privilege...? That's a good question. In my view if as root in lockdown mode you can still run the swapon syscall and setup nfs or other network devices and load userland fuse filesystems or cuse chardev in userland, even if you prevent userfaultfd from blocking kernel faults, kernel faults can still be blocked by other means. That in fact tends to be true also as non root (so regardless of lockdown settings) since luser can generally load fuse filesystems. There is no fundamental integrity breakage or privilege escalation originating in userfaultfd. The only concern here is about this: "after a new use-after-free is discovered in some other part of the kernel (not related to userfaultfd), how easy is it to turn the use-after-free from a mere DoS to a more concerning privilege escalation?". userfaultfd might facilitate the exploitation, but even if you remove userfaultfd from the equation, there's still no guarantee a use-after-free won't materialize as a privilege escalation by other means.
So to express it in another way: unless lockdown (no matter in which mode) is a weak probabilistic based feature and in turn it cannot provide any guarantee to begin with, userfaultfd sysctl set to 0|1|2 can't possibly make any difference to it. The best mitigation for those kind of exploits remains to randomize all kernel memory allocations, so even if the attacker can block the fault, when it's unblocked it'll pick another page, not the one that the attacker can predict it will use, so the attacker needs to repeat the race many more times and hopefully it'll DoS and destabilize the kernel before it can reproduce a privilege escalation. We got many of those randomization features in the current kernel and it's probably more important to enable those than to worry about this sysctl value. One way to have a peace of mind against all use-after-free regardless of this sysctl value, is to run each pod in a KVM instance, that's safer than disabling syscalls or kernel features. The default seccomp profiles of podman already block userfaultfd too, so there's no need of virt to get extra safety if you use containers: containers need to explicitly opt-in to enable userfaultfd through the OCI schema seccomp object. If userfaultfd is being explicitly whitelisted in the OCI schema of the container, well then you know there is a good reason for it. As a matter of fact some things are only possible to achieve with userfaultfd fully enabled. The big value uffd brings compared to trapping sigsegv is precisely to be able to handle kernel faults transparently. sigsegv can't do that because every syscall would return 1) an inconsistent retval and 2) no fault address along with the retval. 
The possible future uffd userland users could be: dropping JVM dirty bit, redis snapshot using pthread_create() instead of fork(), distributed shared memory on pmem, new malloc() implementation never taking mmap_sem for writing in the kernel and never modifying any vma to allocate and free anon memory, etc.. I don't think any of them would work with the sysctl set to "2". The next kernel feature in uffd land that I was discussing with Peter, is an async uffd event model to further optimize the replacement of soft-dirty (which uffd already provides in O(1) instead of O(N)), so the wrprotect fault won't have to block anymore until the uffd async queue overflows. That also is unlikely to work with the sysctl set to "2" without adding extra constraints that soft-dirty doesn't currently have. It would also be possible to implement the value "2" to work like /proc/sys/kernel/unprivileged_bpf_disabled, so when you set it to "1" as root, you can't set it to "2" or "0" and when you set it to "2" you can't set it to "0", but personally I think it's unnecessary. Thanks, Andrea
Re: [PATCH v4 2/4] kasan: record and print the free track
> On Tue, May 19, 2020 at 4:25 AM Walter Wu wrote: > > > > Move free track from slub alloc meta-data to slub free meta-data in > > order to make struct kasan_free_meta size is 16 bytes. It is a good > > size because it is the minimal redzone size and a good number of > > alignment. > > > > For free track in generic KASAN, we do the modification in struct > > kasan_alloc_meta and kasan_free_meta: > > - remove free track from kasan_alloc_meta. > > - add free track into kasan_free_meta. > > > > [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 > > > > Signed-off-by: Walter Wu > > Suggested-by: Dmitry Vyukov > > Cc: Andrey Ryabinin > > Cc: Dmitry Vyukov > > Cc: Alexander Potapenko > > --- > > mm/kasan/common.c | 22 ++ > > mm/kasan/generic.c | 18 ++ > > mm/kasan/kasan.h | 7 +++ > > mm/kasan/report.c | 20 > > mm/kasan/tags.c| 37 + > > 5 files changed, 64 insertions(+), 40 deletions(-) > > > > diff --git a/mm/kasan/common.c b/mm/kasan/common.c > > index 8bc618289bb1..47b53912f322 100644 > > --- a/mm/kasan/common.c > > +++ b/mm/kasan/common.c > > @@ -51,7 +51,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags) > > return stack_depot_save(entries, nr_entries, flags); > > } > > > > -static inline void set_track(struct kasan_track *track, gfp_t flags) > > +void kasan_set_track(struct kasan_track *track, gfp_t flags) > > { > > track->pid = current->pid; > > track->stack = kasan_save_stack(flags); > > @@ -299,24 +299,6 @@ struct kasan_free_meta *get_free_info(struct > > kmem_cache *cache, > > return (void *)object + cache->kasan_info.free_meta_offset; > > } > > > > - > > -static void kasan_set_free_info(struct kmem_cache *cache, > > - void *object, u8 tag) > > -{ > > - struct kasan_alloc_meta *alloc_meta; > > - u8 idx = 0; > > - > > - alloc_meta = get_alloc_info(cache, object); > > - > > -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY > > - idx = alloc_meta->free_track_idx; > > - alloc_meta->free_pointer_tag[idx] = tag; > > - alloc_meta->free_track_idx = (idx + 1) % 
KASAN_NR_FREE_STACKS; > > -#endif > > - > > - set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); > > -} > > - > > void kasan_poison_slab(struct page *page) > > { > > unsigned long i; > > @@ -492,7 +474,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, > > const void *object, > > KASAN_KMALLOC_REDZONE); > > > > if (cache->flags & SLAB_KASAN) > > - set_track(&get_alloc_info(cache, object)->alloc_track, > > flags); > > + kasan_set_track(&get_alloc_info(cache, > > object)->alloc_track, flags); > > > > return set_tag(object, tag); > > } > > diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c > > index 3372bdcaf92a..763d8a13e0ac 100644 > > --- a/mm/kasan/generic.c > > +++ b/mm/kasan/generic.c > > @@ -344,3 +344,21 @@ void kasan_record_aux_stack(void *addr) > > alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; > > alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); > > } > > + > > +void kasan_set_free_info(struct kmem_cache *cache, > > + void *object, u8 tag) > > +{ > > + struct kasan_free_meta *free_meta; > > + > > + free_meta = get_free_info(cache, object); > > + kasan_set_track(&free_meta->free_track, GFP_NOWAIT); > > +} > > + > > +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, > > + void *object, u8 tag) > > +{ > > + struct kasan_free_meta *free_meta; > > + > > + free_meta = get_free_info(cache, object); > > + return &free_meta->free_track; > > +} > > diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h > > index a7391bc83070..ad897ec36545 100644 > > --- a/mm/kasan/kasan.h > > +++ b/mm/kasan/kasan.h > > @@ -127,6 +127,9 @@ struct kasan_free_meta { > > * Otherwise it might be used for the allocator freelist.
> > */ > > struct qlist_node quarantine_link; > > +#ifdef CONFIG_KASAN_GENERIC > > + struct kasan_track free_track; > > +#endif > > }; > > > > struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, > > @@ -168,6 +171,10 @@ void kasan_report_invalid_free(void *object, unsigned > > long ip); > > struct page *kasan_addr_to_page(const void *addr); > > > > depot_stack_handle_t kasan_save_stack(gfp_t flags); > > +void kasan_set_track(struct kasan_track *track, gfp_t flags); > > +void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); > > +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, > > + void *object, u8 tag); > > > > #if defined(CONFIG_KASAN_GENERIC) && \ > > (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) > > diff --git a/mm/kasan/report.c b/mm/kasan/report.c > > index 6f8f2bf8f53b..96d2657fe70f 100644 > > --- a/mm/kasan/report.c > >
Re: [PATCH v4 0/4] cleaning up the sysctls table (hung_task watchdog)
On 2020/5/20 11:31, Andrew Morton wrote: On Tue, 19 May 2020 11:31:07 +0800 Xiaoming Ni wrote: Kernel/sysctl.c eek! fs/proc/proc_sysctl.c| 2 +- include/linux/sched/sysctl.h | 14 +-- include/linux/sysctl.h | 13 ++- kernel/hung_task.c | 77 +++- kernel/sysctl.c | 214 +++ kernel/watchdog.c| 101 6 files changed, 224 insertions(+), 197 deletions(-) Here's what we presently have happening in linux-next's kernel/sysctl.c: sysctl.c | 3109 ++- 1 file changed, 1521 insertions(+), 1588 deletions(-) So this is not a good time for your patch! Can I suggest that you set the idea aside and take a look after 5.8-rc1 is released? ok, I will make v5 patch based on 5.8-rc1 after 5.8-rc1 is released, And add more sysctl table cleanup. Thanks Xiaoming Ni
Re: [PATCH 00/12] Add cpufreq and cci devfreq for mt8183, and SVS support
Hi Andrew, Could you explain the base commit of these patches? When I tried to apply them to v5.7-rc1 for testing, a merge conflict occurs. Thanks, Chanwoo Choi On 5/20/20 12:42 PM, Andrew-sh.Cheng wrote: > MT8183 supports CPU DVFS and CCI DVFS, and LITTLE cpus and CCI are in the > same voltage domain. > So, this series is to add drivers to handle the voltage coupling between CPU > and CCI DVFS. > > For SVS support, need OPP_EVENT_ADJUST_VOLTAGE and corresponding reaction. > > Change since v5: > - Changing dt-binding format to yaml. > - Extending current devfreq passive_governor instead of creating a new > one. > - Resend depending patches of Saravana Kannan based on kernel-5.7 > > > Andrew-sh.Cheng (6): > cpufreq: mediatek: add clock and regulator enable for intermediate > clock > dt-bindings: devfreq: add compatible for mt8183 cci devfreq > devfreq: add mediatek cci devfreq > opp: Modify opp API, dev_pm_opp_get_freq(), find freq in opp, even it > is disabled > cpufreq: mediatek: add opp notification for SVS support > devfreq: mediatek: cci devfreq register opp notification for SVS > support > > Saravana Kannan (6): > OPP: Allow required-opps even if the device doesn't have power-domains > OPP: Add function to look up required OPP's for a given OPP > OPP: Improve required-opps linking > PM / devfreq: Cache OPP table reference in devfreq > PM / devfreq: Add required OPPs support to passive governor > PM / devfreq: Add cpu based scaling support to passive_governor > > .../devicetree/bindings/devfreq/mt8183-cci.yaml| 51 > drivers/cpufreq/mediatek-cpufreq.c | 122 - > drivers/devfreq/Kconfig| 12 + > drivers/devfreq/Makefile | 1 + > drivers/devfreq/devfreq.c | 6 + > drivers/devfreq/governor_passive.c | 298 > +++-- > drivers/devfreq/mt8183-cci-devfreq.c | 233 > drivers/opp/core.c | 85 +- > drivers/opp/of.c | 108 > drivers/opp/opp.h | 5 + > include/linux/devfreq.h| 42 ++- > include/linux/pm_opp.h | 11 + > 12 files changed, 874 insertions(+), 100 deletions(-) > create mode
100644 Documentation/devicetree/bindings/devfreq/mt8183-cci.yaml > create mode 100644 drivers/devfreq/mt8183-cci-devfreq.c >
Endless soft-lockups for compiling workload since next-20200519
Just a head up. Repeatedly compiling kernels for a while would trigger endless soft-lockups since next-20200519 on both x86_64 and powerpc. .config are in, https://github.com/cailca/linux-mm I did first try to revert the linux-next commit 68cd9f4e7238 ("tick/nohz: Narrow down noise while setting current task's tick dependency"), but it did not help. == x86_64 == [ 1167.993773][C1] WARNING: CPU: 1 PID: 0 at kernel/smp.c:127 flush_smp_call_function_queue+0x1fa/0x2e0 [ 1168.00][C1] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat kvm_amd ses kvm enclosure dax_pmem irqbypass dax_pmem_core efivars acpi_cpufreq efivarfs ip_tables x_tables xfs sd_mod smartpqi scsi_transport_sas tg3 mlx5_core libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod [ 1168.029492][C1] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.7.0-rc6-next-20200519 #1 [ 1168.037665][C1] Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 [ 1168.046978][C1] RIP: 0010:flush_smp_call_function_queue+0x1fa/0x2e0 [ 1168.053658][C1] Code: 01 0f 87 c9 12 00 00 83 e3 01 0f 85 cc fe ff ff 48 c7 c7 c0 55 a9 8f c6 05 f6 86 cd 01 01 e8 de 09 ea ff 0f 0b e9 b2 fe ff ff <0f> 0b e9 52 ff ff ff 0f 0b e9 f2 fe ff ff 65 44 8b 25 10 52 3f 71 [ 1168.073262][C1] RSP: 0018:c9178918 EFLAGS: 00010046 [ 1168.079253][C1] RAX: RBX: 430c58f8 RCX: 8ec26083 [ 1168.087156][C1] RDX: 0003 RSI: dc00 RDI: 430c58f8 [ 1168.095054][C1] RBP: c91789a8 R08: ed1108618cec R09: ed1108618cec [ 1168.102964][C1] R10: 430c675b R11: R12: 430c58e0 [ 1168.110866][C1] R13: 8eb30c40 R14: 430c5880 R15: 430c58e0 [ 1168.118767][C1] FS: () GS:4308() knlGS: [ 1168.127628][C1] CS: 0010 DS: ES: CR0: 80050033 [ 1168.134129][C1] CR2: 55b169604560 CR3: 000d08a14000 CR4: 003406e0 [ 1168.142026][C1] Call Trace: [ 1168.145206][C1] [ 1168.147957][C1] ? smp_call_on_cpu_callback+0xd0/0xd0 [ 1168.153421][C1] ? rcu_read_lock_sched_held+0xac/0xe0 [ 1168.158880][C1] ? 
rcu_read_lock_bh_held+0xc0/0xc0 [ 1168.164076][C1] generic_smp_call_function_single_interrupt+0x13/0x2b [ 1168.170938][C1] smp_call_function_single_interrupt+0x157/0x4e0 [ 1168.177278][C1] ? smp_call_function_interrupt+0x4e0/0x4e0 [ 1168.183172][C1] ? interrupt_entry+0xe4/0xf0 [ 1168.187846][C1] ? trace_hardirqs_off_caller+0x8d/0x1f0 [ 1168.193478][C1] ? trace_hardirqs_on_caller+0x1f0/0x1f0 [ 1168.199116][C1] ? _nohz_idle_balance+0x221/0x360 [ 1168.204228][C1] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 1168.209690][C1] call_function_single_interrupt+0xf/0x20 [ 1168.215415][C1] RIP: 0010:_raw_spin_unlock_irqrestore+0x46/0x50 [ 1168.221747][C1] Code: 8d 5e ff 4c 89 e7 e8 a9 35 5f ff f6 c7 02 75 13 53 9d e8 fd c0 6f ff 65 ff 0d 4e ab a6 70 5b 41 5c 5d c3 e8 dc c2 6f ff 53 9d eb 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 65 ff 05 2b ab a6 [ 1168.241353][C1] RSP: 0018:c9178bd0 EFLAGS: 0246 ORIG_RAX: ff04 [ 1168.249700][C1] RAX: RBX: 0246 RCX: 8eba0740 [ 1168.257602][C1] RDX: 0007 RSI: dc00 RDI: 888214f5c8e4 [ 1168.265503][C1] RBP: c9178be0 R08: fbfff2120216 R09: [ 1168.273400][C1] R10: R11: R12: 43145880 [ 1168.281300][C1] R13: 90b2db80 R14: 0002 R15: 0001000164cb [ 1168.289218][C1] ? call_function_single_interrupt+0xa/0x20 [ 1168.295117][C1] ? lockdep_hardirqs_on+0x1b0/0x2c0 [ 1168.300319][C1] _nohz_idle_balance+0x221/0x360 [ 1168.305256][C1] run_rebalance_domains+0x16c/0x2e0 [ 1168.310452][C1] __do_softirq+0x1ca/0x96a [ 1168.314861][C1] ? __irqentry_text_end+0x1fa9e7/0x1fa9e7 [ 1168.320579][C1] ? hrtimer_reprogram+0x170/0x170 [ 1168.325608][C1] ? __bpf_trace_preemptirq_template+0x100/0x100 [ 1168.331856][C1] ? lapic_next_event+0x3c/0x50 [ 1168.336617][C1] ? clockevents_program_event+0xfc/0x180 [ 1168.342249][C1] ? check_flags.part.28+0x86/0x220 [ 1168.347355][C1] ? trace_hardirqs_off+0x8d/0x1f0 [ 1168.352374][C1] ? __bpf_trace_preemptirq_template+0x100/0x100 [ 1168.358620][C1] ? rcu_read_lock_sched_held+0xac/0xe0 [ 1168.364077][C1] ? 
rcu_read_lock_bh_held+0xc0/0xc0 [ 1168.369282][C1] irq_exit+0xd6/0xf0 [ 1168.373168][C1] smp_apic_timer_interrupt+0x215/0x560 [ 1168.378628][C1] ? smp_call_function_single_interrupt+0x4e0/0x4e0 [ 1168.385137][C1] ? smp_call_function_interrupt+0x4e0/0x4e0 [ 1168.391031][C1] ? interrupt_entry+0xe4/0xf0 [ 1168.395705][C1] ? trace_hardirqs_off_caller+0x8d/0x1f0 [ 1168.401336][C1] ? trace_hardirqs
[tip:x86/entry 23/80] arch/x86/entry/common.c:234:24: warning: no previous prototype for function 'prepare_exit_to_usermode'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/entry head: 095b7a3e7745e6fb7cf0a1c09967c4f43e76f8f4 commit: aa9712e07f82a5458f2f16c100c491d736240d60 [23/80] x86/entry/common: Protect against instrumentation config: x86_64-allyesconfig (attached as .config) compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project e6658079aca6d971b4e9d7137a3a2ecbc9c34aec) reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # install x86_64 cross compiling tool for clang build # apt-get install binutils-x86-64-linux-gnu git checkout aa9712e07f82a5458f2f16c100c491d736240d60 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot All warnings (new ones prefixed by >>, old ones prefixed by <<): >> arch/x86/entry/common.c:234:24: warning: no previous prototype for function >> 'prepare_exit_to_usermode' [-Wmissing-prototypes] __visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) ^ arch/x86/entry/common.c:234:19: note: declare 'static' if the function is not intended to be used outside of this translation unit __visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) ^ static >> arch/x86/entry/common.c:296:24: warning: no previous prototype for function >> 'syscall_return_slowpath' [-Wmissing-prototypes] __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) ^ arch/x86/entry/common.c:296:19: note: declare 'static' if the function is not intended to be used outside of this translation unit __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) ^ static 2 warnings generated. 
vim +/prepare_exit_to_usermode +234 arch/x86/entry/common.c 233 > 234 __visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) 235 { 236 instrumentation_begin(); 237 __prepare_exit_to_usermode(regs); 238 instrumentation_end(); 239 exit_to_user_mode(); 240 } 241 242 #define SYSCALL_EXIT_WORK_FLAGS \ 243 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 244 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) 245 246 static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) 247 { 248 bool step; 249 250 audit_syscall_exit(regs); 251 252 if (cached_flags & _TIF_SYSCALL_TRACEPOINT) 253 trace_sys_exit(regs, regs->ax); 254 255 /* 256 * If TIF_SYSCALL_EMU is set, we only get here because of 257 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 258 * We already reported this syscall instruction in 259 * syscall_trace_enter(). 260 */ 261 step = unlikely( 262 (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) 263 == _TIF_SINGLESTEP); 264 if (step || cached_flags & _TIF_SYSCALL_TRACE) 265 tracehook_report_syscall_exit(regs, step); 266 } 267 268 static void __syscall_return_slowpath(struct pt_regs *regs) 269 { 270 struct thread_info *ti = current_thread_info(); 271 u32 cached_flags = READ_ONCE(ti->flags); 272 273 CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 274 275 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && 276 WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) 277 local_irq_enable(); 278 279 rseq_syscall(regs); 280 281 /* 282 * First do one-time work. If these work items are enabled, we 283 * want to run them exactly once per syscall exit with IRQs on. 284 */ 285 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) 286 syscall_slow_exit_work(regs, cached_flags); 287 288 local_irq_disable(); 289 __prepare_exit_to_usermode(regs); 290 } 291 292 /* 293 * Called with IRQs on and fully valid regs. Returns with IRQs off in a 294 * state such that we can immediately switch to user mode. 
295 */ > 296 __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) 297 { 298 instrumentation_begin(); 299 __syscall_return_slowpath(regs); 300 instrumentation_end(); 301 exit_to_user_mode(); 302 } 303 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
[PATCH v11] arm64: dts: qcom: sc7180: Add WCN3990 WLAN module device node
Add device node for the ath10k SNOC platform driver probe and add resources required for WCN3990 on sc7180 soc. Signed-off-by: Rakesh Pillai --- Changes from v10: - Corrected the position of wifi node, as per address - Removed the wlan_fw_mem from reserved memory, since its already added as reserved memory in board DT file. --- arch/arm64/boot/dts/qcom/sc7180-idp.dts | 7 +++ arch/arm64/boot/dts/qcom/sc7180.dtsi| 22 ++ 2 files changed, 29 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sc7180-idp.dts b/arch/arm64/boot/dts/qcom/sc7180-idp.dts index 4e9149d..38b102e 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-idp.dts +++ b/arch/arm64/boot/dts/qcom/sc7180-idp.dts @@ -389,6 +389,13 @@ }; }; + { + status = "okay"; + wifi-firmware { + iommus = <_smmu 0xc2 0x1>; + }; +}; + /* PINCTRL - additions to nodes defined in sc7180.dtsi */ _clk { diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi index 6b12c60..da79f8f 100644 --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi @@ -2811,6 +2811,28 @@ #freq-domain-cells = <1>; }; + + wifi: wifi@1880 { + compatible = "qcom,wcn3990-wifi"; + reg = <0 0x1880 0 0x80>; + reg-names = "membase"; + iommus = <_smmu 0xc0 0x1>; + interrupts = + , + , + , + , + , + , + , + , + , + , + , + ; + memory-region = <_mem>; + status = "disabled"; + }; }; thermal-zones { -- 2.7.4
[PATCH v2 4/4] driver core: Add waiting_for_supplier sysfs file for devices
This would be useful to check if a device is not probing because it's waiting for a supplier to be added and then linked to before it can probe. To reduce sysfs clutter, this file is added only if it can ever be 1. So, if fw_devlink is disabled or set to permissive, this file is not added. Also, this file is removed once the device probes as it's no longer relevant. Signed-off-by: Saravana Kannan --- .../sysfs-devices-waiting_for_supplier| 17 drivers/base/core.c | 26 +++ 2 files changed, 43 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-waiting_for_supplier diff --git a/Documentation/ABI/testing/sysfs-devices-waiting_for_supplier b/Documentation/ABI/testing/sysfs-devices-waiting_for_supplier new file mode 100644 index ..59d073d20db6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-waiting_for_supplier @@ -0,0 +1,17 @@ +What: /sys/devices/.../waiting_for_supplier +Date: May 2020 +Contact: Saravana Kannan +Description: + The /sys/devices/.../waiting_for_supplier attribute is only + present when fw_devlink kernel command line option is enabled + and is set to something stricter than "permissive". It is + removed once a device probes successfully (because the + information is no longer relevant). The number read from it (0 + or 1) reflects whether the device is waiting for one or more + suppliers to be added and then linked to using device links + before the device can probe. + + A value of 0 means the device is not waiting for any suppliers + to be added before it can probe. A value of 1 means the device + is waiting for one or more suppliers to be added before it can + probe. 
diff --git a/drivers/base/core.c b/drivers/base/core.c index 3304ea1a2604..83a3e0b62ce3 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1031,6 +1031,22 @@ static void device_link_drop_managed(struct device_link *link) kref_put(&link->kref, __device_link_del); } +static ssize_t waiting_for_supplier_show(struct device *dev, +struct device_attribute *attr, +char *buf) +{ + bool val; + + device_lock(dev); + mutex_lock(&wfs_lock); + val = !list_empty(&dev->links.needs_suppliers) + && dev->links.need_for_probe; + mutex_unlock(&wfs_lock); + device_unlock(dev); + return sprintf(buf, "%u\n", val); +} +static DEVICE_ATTR_RO(waiting_for_supplier); + /** * device_links_driver_bound - Update device links after probing its driver. * @dev: Device to update the links for. @@ -1055,6 +1071,7 @@ void device_links_driver_bound(struct device *dev) mutex_lock(&wfs_lock); list_del_init(&dev->links.needs_suppliers); mutex_unlock(&wfs_lock); + device_remove_file(dev, &dev_attr_waiting_for_supplier); device_links_write_lock(); @@ -2124,8 +2141,16 @@ static int device_add_attrs(struct device *dev) goto err_remove_dev_groups; } + if (fw_devlink_flags && !fw_devlink_is_permissive()) { + error = device_create_file(dev, &dev_attr_waiting_for_supplier); + if (error) + goto err_remove_dev_online; + } + return 0; + err_remove_dev_online: + device_remove_file(dev, &dev_attr_online); err_remove_dev_groups: device_remove_groups(dev, dev->groups); err_remove_type_groups: @@ -2143,6 +2168,7 @@ static void device_remove_attrs(struct device *dev) struct class *class = dev->class; const struct device_type *type = dev->type; + device_remove_file(dev, &dev_attr_waiting_for_supplier); device_remove_file(dev, &dev_attr_online); device_remove_groups(dev, dev->groups); -- 2.26.2.761.g0e0b3e54be-goog
[PATCH v2 2/4] driver core: Expose device link details in sysfs
It's helpful to be able to look at device link details from sysfs. So, expose it in sysfs. Say device-A is supplier of device-B. These are the additional files this patch would create: /sys/class/devlink/device-A:device-B/ auto_remove_on consumer/ -> .../device-B/ runtime_pm status supplier/ -> .../device-A/ sync_state_only /sys/devices/.../device-A/ consumer:device-B/ -> /sys/class/devlink/device-A:device-B/ /sys/devices/.../device-B/ supplier:device-A/ -> /sys/class/devlink/device-A:device-B/ That way: To get a list of all the device link in the system: ls /sys/class/devlink/ To get the consumer names and links of a device: ls -d /sys/devices/.../device-X/consumer:* To get the supplier names and links of a device: ls -d /sys/devices/.../device-X/supplier:* Signed-off-by: Saravana Kannan --- drivers/base/core.c| 211 +++-- include/linux/device.h | 58 +-- 2 files changed, 233 insertions(+), 36 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 6dbee5885abb..3304ea1a2604 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -235,6 +235,186 @@ void device_pm_move_to_tail(struct device *dev) device_links_read_unlock(idx); } +#define to_devlink(dev)container_of((dev), struct device_link, link_dev) + +static ssize_t status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *status; + + switch (to_devlink(dev)->status) { + case DL_STATE_NONE: + status = "not tracked"; break; + case DL_STATE_DORMANT: + status = "dormant"; break; + case DL_STATE_AVAILABLE: + status = "available"; break; + case DL_STATE_CONSUMER_PROBE: + status = "consumer probing"; break; + case DL_STATE_ACTIVE: + status = "active"; break; + case DL_STATE_SUPPLIER_UNBIND: + status = "supplier unbinding"; break; + default: + status = "unknown"; break; + } + return sprintf(buf, "%s\n", status); +} +static DEVICE_ATTR_RO(status); + +static ssize_t auto_remove_on_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct 
device_link *link = to_devlink(dev); + char *str; + + if (link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER) + str = "supplier unbind"; + else if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) + str = "consumer unbind"; + else + str = "never"; + + return sprintf(buf, "%s\n", str); +} +static DEVICE_ATTR_RO(auto_remove_on); + +static ssize_t runtime_pm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct device_link *link = to_devlink(dev); + + return sprintf(buf, "%d\n", !!(link->flags & DL_FLAG_PM_RUNTIME)); +} +static DEVICE_ATTR_RO(runtime_pm); + +static ssize_t sync_state_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct device_link *link = to_devlink(dev); + + return sprintf(buf, "%d\n", !!(link->flags & DL_FLAG_SYNC_STATE_ONLY)); +} +static DEVICE_ATTR_RO(sync_state_only); + +static struct attribute *devlink_attrs[] = { + &dev_attr_status.attr, + &dev_attr_auto_remove_on.attr, + &dev_attr_runtime_pm.attr, + &dev_attr_sync_state_only.attr, + NULL, +}; +ATTRIBUTE_GROUPS(devlink); + +static void devlink_dev_release(struct device *dev) +{ + kfree(to_devlink(dev)); +} + +static struct class devlink_class = { + .name = "devlink", + .owner = THIS_MODULE, + .dev_groups = devlink_groups, + .dev_release = devlink_dev_release, +}; + +static int devlink_add_symlinks(struct device *dev, + struct class_interface *class_intf) +{ + int ret; + size_t len; + struct device_link *link = to_devlink(dev); + struct device *sup = link->supplier; + struct device *con = link->consumer; + char *buf; + + len = max(strlen(dev_name(sup)), strlen(dev_name(con))); + len += strlen("supplier:") + 1; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = sysfs_create_link(&link->link_dev.kobj, &sup->kobj, "supplier"); + if (ret) + goto out; + + ret = sysfs_create_link(&link->link_dev.kobj, &con->kobj, "consumer"); + if (ret) + goto err_con; + + snprintf(buf, len, "consumer:%s", dev_name(con)); + ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf); + if (ret) +
goto err_con_dev; + + snprintf(buf, len, "supplier:%s", dev_name(sup)); + ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf); + if (ret) + goto err_sup_dev; + + goto out; + +err_sup_dev: +
[PATCH v2 1/4] driver core: Remove unnecessary is_fwnode_dev variable in device_add()
That variable is no longer necessary. Remove it and also fix a minor typo in comments. Signed-off-by: Saravana Kannan --- drivers/base/core.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index f804e561e0a2..6dbee5885abb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2543,7 +2543,6 @@ int device_add(struct device *dev) struct class_interface *class_intf; int error = -EINVAL; struct kobject *glue_dir = NULL; - bool is_fwnode_dev = false; dev = get_device(dev); if (!dev) @@ -2641,11 +2640,6 @@ int device_add(struct device *dev) kobject_uevent(&dev->kobj, KOBJ_ADD); - if (dev->fwnode && !dev->fwnode->dev) { - dev->fwnode->dev = dev; - is_fwnode_dev = true; - } - /* * Check if any of the other devices (consumers) have been waiting for * this device (supplier) to be added so that they can create a device @@ -2654,12 +2648,14 @@ int device_add(struct device *dev) * This needs to happen after device_pm_add() because device_link_add() * requires the supplier be registered before it's called. * -* But this also needs to happe before bus_probe_device() to make sure +* But this also needs to happen before bus_probe_device() to make sure * waiting consumers can link to it before the driver is bound to the * device and the driver sync_state callback is called for this device. */ - if (is_fwnode_dev) + if (dev->fwnode && !dev->fwnode->dev) { + dev->fwnode->dev = dev; fw_devlink_link_device(dev); + } bus_probe_device(dev); if (parent) -- 2.26.2.761.g0e0b3e54be-goog