On Fri, Nov 19, 2021 at 2:14 AM 胡 玮文 <[email protected]> wrote:
>
> Thanks Dan,
>
> I chose one of the stuck clients to investigate. As shown below, it currently
> holds ~269700 caps, which is pretty high for no obvious reason. I cannot
> understand most of the output, and I failed to find any documentation about it.
>
> # ceph tell mds.cephfs.gpu018.ovxvoz client ls id=7915658
> [
> {
> "id": 7915658,
> "entity": {
> "name": {
> "type": "client",
> "num": 7915658
> },
> "addr": {
> "type": "v1",
> "addr": "202.38.247.227:0",
> "nonce": 3019311016
> }
> },
> "state": "open",
> "num_leases": 0,
> "num_caps": 269695,
> "request_load_avg": 184,
> "uptime": 1340483.111458218,
> "requests_in_flight": 0,
> "num_completed_requests": 0,
> "num_completed_flushes": 1,
> "reconnecting": false,
> "recall_caps": {
> "value": 1625220.0378812221,
> "halflife": 60
> },
> "release_caps": {
> "value": 69.432671270941171,
> "halflife": 60
> },
> "recall_caps_throttle": {
> "value": 63255.667075845187,
> "halflife": 1.5
> },
> "recall_caps_throttle2o": {
> "value": 26064.679002183591,
> "halflife": 0.5
> },
> "session_cache_liveness": {
> "value": 259.9718480278375,
> "halflife": 300
> },
The MDS considers your client to be quiescent, so it's asking it to
release caps. However, it's not doing so. This may be a bug in the
kernel client.
> "cap_acquisition": {
> "value": 0,
> "halflife": 10
> },
> "delegated_inos": [... 7 items removed ],
> "inst": "client.7915658 v1:202.38.247.227:0/3019311016",
> "completed_requests": [],
> "prealloc_inos": [ ... 9 items removed ],
> "client_metadata": {
> "client_features": {
> "feature_bits": "0x0000000000007bff"
> },
> "metric_spec": {
> "metric_flags": {
> "feature_bits": "0x000000000000001f"
> }
> },
> "entity_id": "smil",
> "hostname": "gpu027",
> "kernel_version": "5.11.0-37-generic",
> "root": "/"
> }
> }
> ]
>
> I suspect that some files are in use so that their caps cannot be released.
> However, "sudo lsof +f -- /mnt/cephfs | wc -l" just shows about 9k open
> files, well below "num_caps".
>
> I also looked at
> /sys/kernel/debug/ceph/e88d509a-f6fc-11ea-b25d-a0423f3ac864.client7915658/caps
> on the client. The number of lines in it matches the "num_caps" reported by
> MDS. This file also tells me which caps are not released. I investigated some
> of them, but cannot see anything special. One example is attached here.
>
> # ceph tell mds.cephfs.gpu018.ovxvoz dump inode 0x100068b9d24
> {
> "path": "/dataset/coco2017/train2017/000000342643.jpg",
> "ino": 1099621440804,
> "rdev": 0,
> "ctime": "2021-04-23T09:49:54.433652+0000",
> "btime": "2021-04-23T09:49:54.425652+0000",
> "mode": 33204,
> "uid": 859600009,
> "gid": 859600009,
> "nlink": 1,
> "dir_layout": {
> "dir_hash": 0,
> "unused1": 0,
> "unused2": 0,
> "unused3": 0
> },
> "layout": {
> "stripe_unit": 4194304,
> "stripe_count": 1,
> "object_size": 4194304,
> "pool_id": 5,
> "pool_ns": ""
> },
> "old_pools": [],
> "size": 147974,
> "truncate_seq": 1,
> "truncate_size": 18446744073709551615,
> "truncate_from": 0,
> "truncate_pending": 0,
> "mtime": "2021-04-23T09:49:54.433652+0000",
> "atime": "2021-04-23T09:49:54.425652+0000",
> "time_warp_seq": 0,
> "change_attr": 1,
> "export_pin": -1,
> "export_ephemeral_random_pin": 0,
> "export_ephemeral_distributed_pin": false,
> "client_ranges": [],
> "dirstat": {
> "version": 0,
> "mtime": "0.000000",
> "num_files": 0,
> "num_subdirs": 0,
> "change_attr": 0
> },
> "rstat": {
> "version": 0,
> "rbytes": 147974,
> "rfiles": 1,
> "rsubdirs": 0,
> "rsnaps": 0,
> "rctime": "2021-04-23T09:49:54.433652+0000"
> },
> "accounted_rstat": {
> "version": 0,
> "rbytes": 147974,
> "rfiles": 1,
> "rsubdirs": 0,
> "rsnaps": 0,
> "rctime": "2021-04-23T09:49:54.433652+0000"
> },
> "version": 182894,
> "file_data_version": 0,
> "xattr_version": 1,
> "backtrace_version": 177717,
> "stray_prior_path": "",
> "max_size_ever": 0,
> "quota": {
> "max_bytes": 0,
> "max_files": 0
> },
> "last_scrub_stamp": "0.000000",
> "last_scrub_version": 0,
> "symlink": "",
> "xattrs": [],
> "dirfragtree": {
> "splits": []
> },
> "old_inodes": [],
> "oldest_snap": 18446744073709551614,
> "damage_flags": 0,
> "is_auth": true,
> "auth_state": {
> "replicas": {}
> },
> "replica_state": {
> "authority": [
> 0,
> -2
> ],
> "replica_nonce": 0
> },
> "auth_pins": 0,
> "is_frozen": false,
> "is_freezing": false,
> "pins": {
> "caps": 1
> },
> "nref": 1,
> "versionlock": {
> "gather_set": [],
> "state": "lock",
> "is_leased": false,
> "num_rdlocks": 0,
> "num_wrlocks": 0,
> "num_xlocks": 0,
> "xlock_by": {}
> },
> "authlock": {},
> "linklock": {},
> "dirfragtreelock": {},
> "filelock": {},
> "xattrlock": {},
> "snaplock": {},
> "nestlock": {},
> "flocklock": {},
> "policylock": {},
> "states": [
> "auth"
> ],
> "client_caps": [
> {
> "client_id": 7915658,
> "pending": "pAsLsXsFscr",
> "issued": "pAsLsXsFscr",
> "wanted": "-",
> "last_sent": 1
> }
> ],
> "loner": -1,
> "want_loner": -1,
> "mds_caps_wanted": []
> }
>
> I also did an experiment: I executed "find > /dev/null" in one directory to
> acquire some caps. As expected, "num_caps" quickly increased to over 1M. But
> after around an hour, it dropped back to about 269700. So new caps are
> released as expected, but the old caps are still not released.
>
> It seems I need to find out why the client doesn't want to release some
> specific caps.
Double-check you don't have any other configurations you have not
mentioned. Make sure the local ceph.conf for the MDS is clean
(nowadays you should do all configuration via `ceph config set ...`).
--
Patrick Donnelly, Ph.D.
He / Him / His
Principal Software Engineer
Red Hat, Inc.
GPG: 19F28A586F808C2402351B93C3301A3E258DD79D
_______________________________________________
ceph-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]