Hi Frank,
On Tue, 2020-03-24 at 16:32 -0400, Frank Ch. Eigler via Elfutils-devel wrote:
> The following patch generalizes the webapi slightly, which allows
> debuggers like lldb to operate against packages/binaries with
> source files that include posix pathname noise.
>
> commit 8ac3883f36c3c3d48bc90891aad6718a798bdd7a (HEAD -> fche/pr25548)
> Author: Frank Ch. Eigler <[email protected]>
> Date: Mon Mar 23 15:33:56 2020 -0400
>
> PR25548: support canonicalized source-path names in debuginfod webapi
>
> Programs are sometimes compiled with source path names containing
> noise like /./ or // or /foo/../, and these survive into DWARF. This
> code allows either raw or canonicalized pathnames in the webapi, by
> letting the client pass things verbatim, and letting the server
> store/accept both raw and canonicalized path names for source files.
> Tests included & docs updated.
This looks good. And seems very useful. Thanks.
Just a comment below about the documentation (and some silly whitespace).
> Signed-off-by: Frank Ch. Eigler <[email protected]>
>
> diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
> index 2410430c2361..4d687e75bdf8 100644
> --- a/debuginfod/ChangeLog
> +++ b/debuginfod/ChangeLog
> @@ -1,3 +1,14 @@
> +2020-03-24 Frank Ch. Eigler <[email protected]>
> +
> + * debuginfod-client.c (debuginfod_query_server): Set
> + CURLOPT_PATH_AS_IS, to propagate file names verbatim.
> + * debuginfod.cxx (canon_pathname): Implement RFC3986
> + style pathname canonicalization.
> + (handle_buildid): Canonicalize incoming webapi source
> + paths, accept either one.
> + (scan_source_file, archive_classify): Store both
> + original and canonicalized dwarf-source file names.
> +
> 2020-03-22 Frank Ch. Eigler <[email protected]>
>
> * debuginfod-client.c (struct debuginfod_client): Add url field.
> diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c
> index 58a04b9a734b..3d6f645d56f2 100644
> --- a/debuginfod/debuginfod-client.c
> +++ b/debuginfod/debuginfod-client.c
> @@ -681,6 +681,7 @@ debuginfod_query_server (debuginfod_client *c,
> curl_easy_setopt(data[i].handle, CURLOPT_FOLLOWLOCATION, (long) 1);
> curl_easy_setopt(data[i].handle, CURLOPT_FAILONERROR, (long) 1);
> curl_easy_setopt(data[i].handle, CURLOPT_NOSIGNAL, (long) 1);
> + curl_easy_setopt(data[i].handle, CURLOPT_PATH_AS_IS, (long) 1);
> curl_easy_setopt(data[i].handle, CURLOPT_AUTOREFERER, (long) 1);
> curl_easy_setopt(data[i].handle, CURLOPT_ACCEPT_ENCODING, "");
> add_extra_headers(data[i].handle);
> diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
> index 7c7e85eb6d14..dc62eb088c02 100644
> --- a/debuginfod/debuginfod.cxx
> +++ b/debuginfod/debuginfod.cxx
> @@ -944,6 +944,82 @@ shell_escape(const string& str)
> }
>
>
> +// PR25548: Perform POSIX / RFC3986 style path canonicalization on the input
> string.
> +//
> +// Namely:
> +// // -> /
> +// /foo/../ -> /
> +// /./ -> /
> +//
> +// This mapping is done on dwarf-side source path names, which may
> +// include these constructs, so we can deal with debuginfod clients
> +// that accidentally canonicalize the paths.
> +//
> +// realpath(3) is close but not quite right, because it also resolves
> +// symbolic links. Symlinks at the debuginfod server have nothing to
> +// do with the build-time symlinks, thus they must not be considered.
> +//
> +// see also curl Curl_dedotdotify() aka RFC3986, which we mostly follow here
> +// see also libc __realpath()
> +// see also llvm llvm::sys::path::remove_dots()
It would be good to add (part of) this to the documentation.
> +static string
> +canon_pathname (const string& input)
> +{
> + string i = input; // 5.2.4 (1)
> + string o;
> +
> + while (i.size() != 0)
> + {
> + // 5.2.4 (2) A
> + if (i.substr(0,3) == "../")
> + i = i.substr(3);
> + else if(i.substr(0,2) == "./")
> + i = i.substr(2);
> +
> + // 5.2.4 (2) B
> + else if (i.substr(0,3) == "/./")
> + i = i.substr(2);
> + else if (i == "/.")
> + i = ""; // no need to handle "/." complete-path-segment case; we're
> dealing with file names
> +
> + // 5.2.4 (2) C
> + else if (i.substr(0,4) == "/../") {
> + i = i.substr(3);
> + string::size_type sl = o.rfind("/");
> + if (sl != string::npos)
> + o = o.substr(0, sl);
> + else
> + o = "";
> + } else if (i == "/..")
> + i = ""; // no need to handle "/.." complete-path-segment case; we're
> dealing with file names
> +
> + // 5.2.4 (2) D
> + // no need to handle these cases; we're dealing with file names
> + else if (i == ".")
> + i = "";
> + else if (i == "..")
> + i = "";
> +
> + // POSIX special: map // to /
> + else if (i.substr(0,2) == "//")
> + i = i.substr(1);
> +
> + // 5.2.4 (2) E
> + else {
> + string::size_type next_slash = i.find("/", (i[0]=='/' ? 1 : 0)); //
> skip first slash
> + o += i.substr(0, next_slash);
> + if (next_slash == string::npos)
> + i = "";
> + else
> + i = i.substr(next_slash);
> + }
> + }
> +
> + return o;
> +}
> +
> +
> +
> // A map-like class that owns a cache of file descriptors (indexed by
> // file / content names).
> //
> @@ -1393,12 +1469,17 @@ static struct MHD_Response* handle_buildid (const
> string& buildid /* unsafe */,
> }
> else if (atype_code == "S")
> {
> + // PR25548
> + // Incoming source queries may come in with either dwarf-level OR
> canonicalized paths.
> + // We let the query pass with either one.
> +
> pp = new sqlite_ps (db, "mhd-query-s",
> - "select mtime, sourcetype, source0, source1 from "
> BUILDIDS "_query_s where buildid = ? and artifactsrc = ? "
> + "select mtime, sourcetype, source0, source1 from "
> BUILDIDS "_query_s where buildid = ? and artifactsrc in (?,?) "
> "order by sharedprefix(source0,source0ref) desc,
> mtime desc");
> pp->reset();
> pp->bind(1, buildid);
> pp->bind(2, suffix);
> + pp->bind(3, canon_pathname(suffix));
> }
> unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return
>
> @@ -2083,6 +2164,27 @@ scan_source_file (const string& rps, const stat_t& st,
> .bind(4, sfs.st_mtime)
> .step_ok_done();
>
> + // PR25548: also store canonicalized source path
> + string dwarfsrc_canon = canon_pathname (dwarfsrc);
> + if (dwarfsrc_canon != dwarfsrc)
> + {
> + if (verbose > 3)
> + obatched(clog) << "canonicalized src=" << dwarfsrc << "
> alias=" << dwarfsrc_canon << endl;
> +
> + ps_upsert_files
> + .reset()
> + .bind(1, dwarfsrc_canon)
> + .step_ok_done();
> +
> + ps_upsert_s
> + .reset()
> + .bind(1, buildid)
> + .bind(2, dwarfsrc_canon)
> + .bind(3, srps)
> + .bind(4, sfs.st_mtime)
> + .step_ok_done();
> + }
> +
> inc_metric("found_sourcerefs_total","source","files");
> }
> }
> @@ -2244,6 +2346,26 @@ archive_classify (const string& rps, string&
> archive_extension,
> .bind(2, s)
> .step_ok_done();
>
> + // PR25548: also store canonicalized source path
> + const string& dwarfsrc = s;
> + string dwarfsrc_canon = canon_pathname (dwarfsrc);
> + if (dwarfsrc_canon != dwarfsrc)
> + {
> + if (verbose > 3)
> + obatched(clog) << "canonicalized src=" << dwarfsrc
> << " alias=" << dwarfsrc_canon << endl;
> +
> + ps_upsert_files
> + .reset()
> + .bind(1, dwarfsrc_canon)
> + .step_ok_done();
> +
> + ps_upsert_sref
> + .reset()
> + .bind(1, buildid)
> + .bind(2, dwarfsrc_canon)
> + .step_ok_done();
> + }
> +
> fts_sref ++;
> }
> }
Looks good.
> diff --git a/doc/ChangeLog b/doc/ChangeLog
> index 59809ea8a1e2..564644f41907 100644
> --- a/doc/ChangeLog
> +++ b/doc/ChangeLog
> @@ -1,3 +1,8 @@
> +2020-03-24 Frank Ch. Eigler <[email protected]>
> +
> + * debuginfod-find.1, debuginfod_find_debuginfo.3: Document
> + source path canonicalization.
> +
> 2020-03-22 Frank Ch. Eigler <[email protected]>
>
> * debuginfod_get_url.3: New function, documented ...
> diff --git a/doc/debuginfod-find.1 b/doc/debuginfod-find.1
> index e71ca29be96e..10c609f2c18e 100644
> --- a/doc/debuginfod-find.1
> +++ b/doc/debuginfod-find.1
> @@ -78,10 +78,9 @@ is made up of multiple CUs. Therefore, to disambiguate,
> debuginfod
> expects source queries to prefix relative path names with the CU
> compilation-directory, followed by a mandatory "/".
>
> -Note: the user should not elide \fB../\fP or \fB/./\fP or extraneous
> -\fB///\fP sorts of path components in the directory names, because if
> -this is how those names appear in the DWARF files, that is what
> -debuginfod needs to see too.
> +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous
> +\fB///\fP sorts of path components in the directory names. debuginfod
> +accepts both forms.
>
> For example:
> .TS
> diff --git a/doc/debuginfod_find_debuginfo.3 b/doc/debuginfod_find_debuginfo.3
> index f7ec6cc134ba..48846119b3aa 100644
> --- a/doc/debuginfod_find_debuginfo.3
> +++ b/doc/debuginfod_find_debuginfo.3
> @@ -87,10 +87,9 @@ paths, and yet an executable is made up of multiple CUs.
> Therefore, to
> disambiguate, debuginfod expects source queries to prefix relative path
> names with the CU compilation-directory, followed by a mandatory "/".
>
> -Note: the caller should not elide \fB../\fP or \fB/./\fP or extraneous
> -\fB///\fP sorts of path components in the directory names, because if
> -this is how those names appear in the DWARF files, that is what
> -debuginfod needs to see too.
> +Note: the caller may or may not elide \fB../\fP or \fB/./\fP or extraneous
> +\fB///\fP sorts of path components in the directory names. debuginfod
> +accepts both forms.
>
> If \fIpath\fP is not NULL and the query is successful, \fIpath\fP is set
> to the path of the file in the cache. The caller must \fBfree\fP() this
> value.
Could you add something that describes the exact form of
canonicalization done? Maybe just:
"Canonicalization is done according to RFC3986 section 5.2.4 (Remove
Dot Segments), plus reducing any '//' to '/' in the path."
> diff --git a/tests/ChangeLog b/tests/ChangeLog
> index cefbceb4f3a8..47b40f1c18a9 100644
> --- a/tests/ChangeLog
> +++ b/tests/ChangeLog
> @@ -1,3 +1,9 @@
> +2020-03-24 Frank Ch. Eigler <[email protected]>
> +
> + * debuginfod-rpms/hello3.spec., /fedora31/*: New files with
> + uncanonicalized source paths.
> + * run-debuginfod-find.sh: Test them.
> +
> 2020-03-22 Frank Ch. Eigler <[email protected]>
>
> * run-debuginfod-find.sh: Look for URL in default progressfn
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm
> new file mode 100644
> index 000000000000..d0b3454083d8
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.src.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..8b2fe9bbd9e7
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..ee479ecb4909
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-debuginfo-1.0-2.x86_64.rpm differ
> diff --git
> a/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..890478e429ab
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-debugsource-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..73fd939d1597
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-two-1.0-2.x86_64.rpm differ
> diff --git
> a/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm
> b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm
> new file mode 100644
> index 000000000000..0cc24073aa3e
When using git format-patch instead of git show you'll get something
that people can apply and that shows how big the files are. I assume
they are small? The existing ones are between the 4K and 12K.
> Binary files /dev/null and
> b/tests/debuginfod-rpms/fedora31/hello3-two-debuginfo-1.0-2.x86_64.rpm differ
> diff --git a/tests/debuginfod-rpms/hello3.spec.
> b/tests/debuginfod-rpms/hello3.spec.
> new file mode 100644
> index 000000000000..ffb95134e2b6
> --- /dev/null
> +++ b/tests/debuginfod-rpms/hello3.spec.
> @@ -0,0 +1,60 @@
> +Summary: hello3 -- double hello, world rpm
> +Name: hello3
> +Version: 1.0
> +Release: 2
> +Group: Utilities
> +License: GPL
> +Distribution: RPM ^W Elfutils test suite.
> +Vendor: Red Hat Software
> +Packager: Red Hat Software <[email protected]>
> +URL: http://www.redhat.com
> +BuildRequires: gcc make
> +Source0: hello-1.0.tar.gz
> +
> +%description
> +Simple rpm demonstration with an eye to consumption by debuginfod.
> +
> +%package two
> +Summary: hello3two
> +License: GPL
> +
> +%description two
> +Dittoish.
> +
> +%prep
> +%setup -q -n hello-1.0
> +
> +%build
> +mkdir foobar
> +gcc -g -O1 foobar///./../hello.c -o hello
> +gcc -g -O2 -D_FORTIFY_SOURCE=2 foobar///./../hello.c -o hello3
> +
> +%install
> +rm -rf $RPM_BUILD_ROOT
> +mkdir -p $RPM_BUILD_ROOT/usr/local/bin
> +cp hello $RPM_BUILD_ROOT/usr/local/bin/
> +cp hello3 $RPM_BUILD_ROOT/usr/local/bin/
> +
> +%clean
> +rm -rf $RPM_BUILD_ROOT
> +
> +%files
> +%defattr(-,root,root)
> +%attr(0751,root,root) /usr/local/bin/hello
> +
> +%files two
> +%defattr(-,root,root)
> +%attr(0751,root,root) /usr/local/bin/hello3
> +
> +%changelog
> +* Tue Mar 24 2020 Frank Ch. Eigler <[email protected]>
> +- New variant of hello2, with crazy source file paths
> +
> +* Thu Nov 14 2019 Frank Ch. Eigler <[email protected]>
> +- Dropped misc files not relevant to debuginfod testing.
> +
> +* Wed May 18 2016 Mark Wielaard <[email protected]>
> +- Add hello2 for dwz testing support.
> +
> +* Tue Oct 20 1998 Jeff Johnson <[email protected]>
> +- create.
> diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
> index b64130282d86..e0a9beb9c11c 100755
> --- a/tests/run-debuginfod-find.sh
> +++ b/tests/run-debuginfod-find.sh
> @@ -23,8 +23,8 @@ type rpm2cpio 2>/dev/null || (echo "need rpm2cpio"; exit 77)
> type bzcat 2>/dev/null || (echo "need bzcat"; exit 77)
>
> # for test case debugging, uncomment:
> -# set -x
> -# VERBOSE=-vvvv
> +#set -x
> +#VERBOSE=-vvvv
whitespace change?
> DB=${PWD}/.debuginfod_tmp.sqlite
> tempfiles $DB
> @@ -40,7 +40,7 @@ cleanup()
> if [ $PID2 -ne 0 ]; then kill $PID2; wait $PID2; fi
> if [ $PID3 -ne 0 ]; then kill $PID3; wait $PID3; fi
>
> - rm -rf F R D L Z ${PWD}/mocktree ${PWD}/.client_cache* ${PWD}/tmp*
> + rm -rf F R D L Z ${PWD}/foobar ${PWD}/mocktree ${PWD}/.client_cache*
> ${PWD}/tmp*
> exit_cleanup
> }
>
> @@ -112,7 +112,9 @@ export DEBUGINFOD_TIMEOUT=10
> # cannot find it without debuginfod.
> echo "int main() { return 0; }" > ${PWD}/prog.c
> tempfiles prog.c
> -gcc -Wl,--build-id -g -o prog ${PWD}/prog.c
> +# Create a subdirectory to confound source path names
> +mkdir foobar
> +gcc -Wl,--build-id -g -o prog ${PWD}/foobar///./../prog.c
> testrun ${abs_top_builddir}/src/strip -g -f prog.debug ${PWD}/prog
> BUILDID=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \
> -a prog | grep 'Build ID' | cut -d ' ' -f 7`
> @@ -142,9 +144,15 @@ cmp $filename F/prog.debug
> filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable
> $BUILDID`
> cmp $filename F/prog
>
> +# raw source filename
> +filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source
> $BUILDID ${PWD}/foobar///./../prog.c`
> +cmp $filename ${PWD}/prog.c
> +
> +# and also the canonicalized one
> filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find source
> $BUILDID ${PWD}/prog.c`
> cmp $filename ${PWD}/prog.c
>
> +
> ########################################################################
And another?
> # Test whether the cache default locations are correct
> @@ -291,6 +299,9 @@ archive_test() {
>
> # common source file sha1
> SHA=f4a1a8062be998ae93b8f1cd744a398c6de6dbb1
> +# fedora31
> +archive_test 420e9e3308971f4b817cc5bf83928b41a6909d88
> /usr/src/debug/hello3-1.0-2.x86_64/foobar////./../hello.c $SHA
> +archive_test 87c08d12c78174f1082b7c888b3238219b0eb265
> /usr/src/debug/hello3-1.0-2.x86_64///foobar/./..//hello.c $SHA
> # fedora30
> archive_test c36708a78618d597dee15d0dc989f093ca5f9120
> /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA
> archive_test 41a236eb667c362a1c4196018cc4581e09722b1b
> /usr/src/debug/hello2-1.0-2.x86_64/hello.c $SHA
Nice testcases.
Thanks,
Mark