On 8/5/24 3:29 PM, Kevin Wolf wrote: > Am 16.07.2024 um 16:41 hat Andrey Drobyshev geschrieben: >> The script is basically a wrapper around "filefrag" utility. This might >> be used to map virtual offsets within the file to the underlying block >> device offsets. In addition, a chunk size might be specified, in which >> case a list of such mappings will be obtained: >> >> $ scripts/filev2p.py -s 100M /sparsefile 1768M >> 1853882368..1895825407 (file) -> 16332619776..16374562815 (/dev/sda4) -> >> 84492156928..84534099967 (/dev/sda) >> 1895825408..1958739967 (file) -> 17213591552..17276506111 (/dev/sda4) -> >> 85373128704..85436043263 (/dev/sda) >> >> This could come in handy when we need to map a certain piece of data >> within a file inside VM to the same data within the image on the host >> (e.g. physical offset on VM's /dev/sda would be the virtual offset >> within QCOW2 image). >> >> Note: as of now the script only works with the files located on plain >> partitions, i.e. it doesn't work with partitions built on top of LVM. >> Partitions on LVM would require another level of mapping. >> >> Signed-off-by: Andrey Drobyshev <[email protected]> >> --- >> scripts/filev2p.py | 311 +++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 311 insertions(+) >> create mode 100755 scripts/filev2p.py >> >> diff --git a/scripts/filev2p.py b/scripts/filev2p.py >> new file mode 100755 >> index 0000000000..3bd7d18b5e >> --- /dev/null >> +++ b/scripts/filev2p.py >> @@ -0,0 +1,311 @@ >> +#!/usr/bin/env python3 >> +# >> +# Map file virtual offset to the offset on the underlying block device. >> +# Works by parsing 'filefrag' output. >> +# >> +# Copyright (c) 2024 Virtuozzo International GmbH. >> +# >> +# This program is free software; you can redistribute it and/or modify >> +# it under the terms of the GNU General Public License as published by >> +# the Free Software Foundation; either version 2 of the License, or >> +# (at your option) any later version. >> +# >> +# This program is distributed in the hope that it will be useful, >> +# but WITHOUT ANY WARRANTY; without even the implied warranty of >> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +# GNU General Public License for more details. >> +# >> +# You should have received a copy of the GNU General Public License >> +# along with this program. If not, see <http://www.gnu.org/licenses/>. >> +# >> + >> +import argparse >> +import os >> +import subprocess >> +import re >> +import sys >> + >> +from bisect import bisect_right >> +from collections import namedtuple >> +from dataclasses import dataclass >> +from shutil import which >> +from stat import S_ISBLK >> + >> + >> +Partition = namedtuple('Partition', ['partpath', 'diskpath', 'part_offt']) >> + >> + >> +@dataclass >> +class Extent: >> + '''Class representing an individual file extent. >> + >> + This is basically a piece of data within the file which is located >> + consecutively (i.e. not sparsely) on the underlying block device. >> + ''' > > Python docstrings should always be triple double quotes """...""" as per > PEP 257. > > Some functions below even use a single single quote because they are on > a single line. They should still use the same convention. > >> + >> + log_start: int >> + log_end: int >> + phys_start: int >> + phys_end: int >> + length: int >> + partition: Partition >> + >> + @property >> + def disk_start(self): >> + 'Number of the first byte of this extent on the whole disk >> (/dev/sda)' >> + return self.partition.part_offt + self.phys_start >> + >> + @property >> + def disk_end(self): >> + 'Number of the last byte of this extent on the whole disk >> (/dev/sda)' >> + return self.partition.part_offt + self.phys_end >> + >> + def __str__(self): >> + ischunk = self.log_end > self.log_start >> + maybe_end = lambda s: f'..{s}' if ischunk else '' >> + return '%s%s (file) -> %s%s (%s) -> %s%s (%s)' % ( >> + self.log_start, maybe_end(self.log_end), >> + self.phys_start, maybe_end(self.phys_end), >> self.partition.partpath, >> + self.disk_start, maybe_end(self.disk_end), >> self.partition.diskpath >> + ) >> + >> + @classmethod >> + def ext_slice(cls, bigger_ext, start, end): >> + '''Constructor for the Extent class from a bigger extent. >> + >> + Return Extent instance which is a slice of @bigger_ext contained >> + within the range [start, end]. >> + ''' >> + >> + assert start >= bigger_ext.log_start >> + assert end <= bigger_ext.log_end >> + >> + if start == bigger_ext.log_start and end == bigger_ext.log_end: >> + return bigger_ext >> + >> + phys_start = bigger_ext.phys_start + (start - bigger_ext.log_start) >> + phys_end = bigger_ext.phys_end - (bigger_ext.log_end - end) >> + length = end - start + 1 >> + >> + return cls(start, end, phys_start, phys_end, length, >> + bigger_ext.partition) >> + >> + >> +def run_cmd(cmd: str) -> str: >> + '''Wrapper around subprocess.run. >> + >> + Returns stdout in case of success, emits en error and exits in case >> + of failure. >> + ''' >> + >> + proc = subprocess.run(cmd, stdout=subprocess.PIPE, >> stderr=subprocess.PIPE, >> + check=False, shell=True) >> + if proc.stderr is not None: >> + stderr = f'\n{proc.stderr.decode().strip()}' >> + else: >> + stderr = '' >> + >> + if proc.returncode: >> + sys.exit(f'Error: Command "{cmd}" returned >> {proc.returncode}:{stderr}') >> + >> + return proc.stdout.decode().strip() >> + >> + >> +def parse_size(offset: str) -> int: >> + 'Convert human readable size to bytes' >> + >> + suffixes = { >> + **dict.fromkeys(['k', 'K', 'Kb', 'KB', 'KiB'], 2 ** 10), >> + **dict.fromkeys(['m', 'M', 'Mb', 'MB', 'MiB'], 2 ** 20), >> + **dict.fromkeys(['g', 'G', 'Gb', 'GB', 'GiB'], 2 ** 30), >> + **dict.fromkeys( ['T', 'Tb', 'TB', 'TiB'], 2 ** 40), >> + **dict.fromkeys([''], 1) >> + } >> + >> + sizematch = re.match(r'^([0-9]+)\s*([a-zA-Z]*)$', offset) >> + if not bool(sizematch): >> + sys.exit(f'Error: Couldn\'t parse size "{offset}". Pass offset ' >> + 'either in bytes or in format 1K, 2M, 3G') >> + >> + num, suff = sizematch.groups() >> + num = int(num) >> + >> + mult = suffixes.get(suff) >> + if mult is None: >> + sys.exit(f'Error: Couldn\'t parse size "{offset}": ' >> + f'unknown suffix {suff}') >> + >> + return num * mult >> + >> + >> +def fpath2part(filename: str) -> str: >> + 'Get partition on which @filename is located (i.e. /dev/sda1).' >> + >> + partpath = run_cmd(f'df --output=source {filename} | tail -n+2') > > Anything passed to a shell (like {filename}) certainly must have proper > quoting applied to avoid shell injections? > >> + if not os.path.exists(partpath) or not >> S_ISBLK(os.stat(partpath).st_mode): >> + sys.exit(f'Error: file {filename} is located on {partpath} which ' >> + 'isn\'t a block device') >> + return partpath >> + >> + >> +def part2dev(partpath: str, filename: str) -> str: >> + 'Get block device on which @partpath is located (i.e. /dev/sda).' >> + dev = run_cmd(f'lsblk -no PKNAME {partpath}') > > Missing quoting here, too. > >> + diskpath = f'/dev/{dev}' >> + if not os.path.exists(diskpath) or not >> S_ISBLK(os.stat(diskpath).st_mode): >> + sys.exit(f'Error: file {filename} is located on {diskpath} which ' >> + 'isn\'t a block device') >> + return diskpath >> + >> + >> +def part2disktype(partpath: str) -> str: >> + 'Parse /proc/devices and get block device type for @partpath' >> + >> + major = os.major(os.stat(partpath).st_rdev) >> + assert major >> + with open('/proc/devices', encoding='utf-8') as devf: >> + for line in reversed(list(devf)): >> + # Our major cannot be absent among block devs >> + if line.startswith('Block'): >> + break >> + devmajor, devtype = line.strip().split() >> + if int(devmajor) == major: >> + return devtype >> + >> + sys.exit('Error: We haven\'t found major {major} in /proc/devices, ' >> + 'and that can\'t be') >> + >> + >> +def get_part_offset(part: str, disk: str) -> int: >> + 'Get offset in bytes of the partition @part on the block device @disk.' >> + >> + lines = run_cmd(f'fdisk -l {disk} | egrep >> "^(Units|{part})"').splitlines() > > And here. > > We should probably also match a space after {part} to avoid selecting > other partitions that have {part} as a prefix (like partition 10 when we > want partition 1). I think we would actually always get the wanted one > first, but it would be cleaner to not even have the others in the > output. > >> + >> + unitmatch = re.match('^.* = ([0-9]+) bytes$', lines[0]) >> + if not bool(unitmatch): >> + sys.exit(f'Error: Couldn\'t parse "fdisk -l" output:\n{lines[0]}') >> + secsize = int(unitmatch.group(1)) >> + >> + part_offt = int(lines[1].split()[1]) >> + return part_offt * secsize >> + >> + >> +def parse_frag_line(line: str, partition: Partition) -> Extent: >> + 'Construct Extent instance from a "filefrag" output line.' >> + >> + nums = [int(n) for n in re.findall(r'[0-9]+', line)] >> + >> + log_start = nums[1] >> + log_end = nums[2] >> + phys_start = nums[3] >> + phys_end = nums[4] >> + length = nums[5] >> + >> + assert log_start < log_end >> + assert phys_start < phys_end >> + assert (log_end - log_start + 1) == (phys_end - phys_start + 1) == >> length >> + >> + return Extent(log_start, log_end, phys_start, phys_end, length, >> partition) >> + >> + >> +def preliminary_checks(args: argparse.Namespace) -> None: >> + 'A bunch of checks to emit an error and exit at the earlier stage.' >> + >> + if which('filefrag') is None: >> + sys.exit('Error: Program "filefrag" doesn\'t exist') >> + >> + if not os.path.exists(args.filename): >> + sys.exit(f'Error: File {args.filename} doesn\'t exist') >> + >> + args.filesize = os.path.getsize(args.filename) >> + if args.offset >= args.filesize: >> + sys.exit(f'Error: Specified offset {args.offset} exceeds ' >> + f'file size {args.filesize}') >> + if args.size and (args.offset + args.size > args.filesize): >> + sys.exit(f'Error: Chunk of size {args.size} at offset ' >> + f'{args.offset} exceeds file size {args.filesize}') >> + >> + args.partpath = fpath2part(args.filename) >> + args.disktype = part2disktype(args.partpath) >> + if args.disktype not in ('sd', 'virtblk'): >> + sys.exit(f'Error: Cannot analyze files on {args.disktype} disks') >> + args.diskpath = part2dev(args.partpath, args.filename) >> + args.part_offt = get_part_offset(args.partpath, args.diskpath) >> + >> + >> +def get_extent_maps(args: argparse.Namespace) -> list[Extent]: >> + 'Run "filefrag", parse its output and return a list of Extent >> instances.' >> + >> + lines = run_cmd(f'filefrag -b1 -v {args.filename}').splitlines() > > And the final missing quoting. > >> + >> + ffinfo_re = re.compile('.* is ([0-9]+) .*of ([0-9]+) bytes') >> + ff_size, ff_block = re.match(ffinfo_re, lines[1]).groups() >> + >> + # Paranoia checks >> + if int(ff_size) != args.filesize: >> + sys.exit('Error: filefrag and os.path.getsize() report different ' >> + f'sizes: {ff_size} and {args.filesize}') >> + if int(ff_block) != 1: >> + sys.exit(f'Error: "filefrag -b1" invoked, but block size is >> {ff_block}') >> + >> + partition = Partition(args.partpath, args.diskpath, args.part_offt) >> + >> + # Fill extents list from the output >> + extents = [] >> + for line in lines: >> + if not re.match(r'^\s*[0-9]+:', line): >> + continue >> + extents += [parse_frag_line(line, partition)] >> + >> + chunk_start = args.offset >> + chunk_end = args.offset + args.size - 1 >> + ext_offsets = [ext.log_start for ext in extents] >> + start_ind = bisect_right(ext_offsets, chunk_start) - 1 >> + end_ind = bisect_right(ext_offsets, chunk_end) - 1 >> + >> + res_extents = extents[start_ind : end_ind + 1] >> + for i, ext in enumerate(res_extents): >> + start = max(chunk_start, ext.log_start) >> + end = min(chunk_end, ext.log_end) >> + res_extents[i] = Extent.ext_slice(ext, start, end) >> + >> + return res_extents >> + >> + >> +def parse_args() -> argparse.Namespace: >> + 'Define program arguments and parse user input.' >> + >> + parser = argparse.ArgumentParser(description=''' >> +Map file offset to physical offset on the block device >> + >> +With --size provided get a list of mappings for the chunk''', >> + formatter_class=argparse.RawTextHelpFormatter) >> + >> + parser.add_argument('filename', type=str, help='filename to process') >> + parser.add_argument('offset', type=str, >> + help='logical offset inside the file') >> + parser.add_argument('-s', '--size', required=False, type=str, >> + help='size of the file chunk to get offsets for') >> + args = parser.parse_args() >> + >> + args.offset = parse_size(args.offset) >> + if args.size: >> + args.size = parse_size(args.size) >> + else: >> + # When no chunk size is provided (only offset), it's equivalent to >> + # chunk size == 1 >> + args.size = 1 >> + >> + return args >> + >> + >> +def main() -> int: >> + args = parse_args() >> + preliminary_checks(args) >> + extents = get_extent_maps(args) >> + for ext in extents: >> + print(ext) >> + >> + >> +if __name__ == '__main__': >> + sys.exit(main()) > > Kevin >
Agreed on all the above comments, thank you.
