D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread Raphaël Gomès
Alphare added inline comments.

INLINE COMMENTS

> martinvonz wrote in path_auditor.rs:191
> This test now fails. I'll fix it.

Right, sorry!

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread martinvonz (Martin von Zweigbergk)
martinvonz added inline comments.

INLINE COMMENTS

> path_auditor.rs:191
> +auditor.audit_path(path),
> +Err(HgPathError::ContainsIllegalComponent(path.to_owned()))
> +);

This test now fails. I'll fix it.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread Raphaël Gomès
Closed by commit rHGa540e96b9029: rust-pathauditor: add Rust implementation of 
the `pathauditor` (authored by Alphare).
This revision was automatically updated to reflect the committed changes.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19934&id=19954

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,230 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::lower_clean,
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+
+let first_component = lower_clean(parts[0]);
+let first_component = first_component.as_slice();
+if !path.split_drive().0.is_empty()
+|| (first_component == b".hg"
+|| first_component == b".hg."
+|| first_component == b"")
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::InsideDotHg(path.to_owned()));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let first =
+split.next().unwrap().to_owned().to_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(u8::is_ascii_digit)
+&& (first == b"HG" || first == b"HG8B6C")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+let lower_path = lower_clean(path.as_bytes());
+if find_slice_in_slice(&lower_path, b".hg").is_some() {
+let lower_parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if let Some(pos) = lower_parts[1..]
+.iter()
+.position(|part| part == &pattern.as_slice())
+{
+let base = lower_parts[..=pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+// We don't want to add "foo/bar/baz" to `audited_dirs` before checking
+// if there's a "foo/.hg" directory. This also means we won't
+// accidentally traverse a symlink into some other filesystem (which
+// is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix = &parts[..index + 1].join

D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread martinvonz (Martin von Zweigbergk)
This revision is now accepted and ready to land.
martinvonz added inline comments.
martinvonz accepted this revision.

INLINE COMMENTS

> martinvonz wrote in path_auditor.rs:86
> Well, you already have different errors for `InsideDotHg` and 
> `IsInsideNestedRepo`, so I think you're good. I was just trying to explain 
> the reason for the `1` there in the code (which I assume you just copied from 
> Python). Sorry about the confusion.

Oh, I only now looked at the diff from the previous review and you just added 
`InsideDotHg`. So my comment was useful then and you were not confused -- I 
was. Thanks for fixing :)

REPOSITORY
  rHG Mercurial

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread martinvonz (Martin von Zweigbergk)
martinvonz added inline comments.

INLINE COMMENTS

> Alphare wrote in path_auditor.rs:86
> Good idea. I'm rewriting my patches, I will soon send an update for all of 
> them.

Well, you already have different errors for `InsideDotHg` and 
`IsInsideNestedRepo`, so I think you're good. I was just trying to explain the 
reason for the `1` there in the code (which I assume you just copied from 
Python). Sorry about the confusion.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread Raphaël Gomès
Alphare updated this revision to Diff 19934.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19544&id=19934

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,230 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::lower_clean,
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+
+let first_component = lower_clean(parts[0]);
+let first_component = first_component.as_slice();
+if !path.split_drive().0.is_empty()
+|| (first_component == b".hg"
+|| first_component == b".hg."
+|| first_component == b"")
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::InsideDotHg(path.to_owned()));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let first =
+split.next().unwrap().to_owned().to_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(u8::is_ascii_digit)
+&& (first == b"HG" || first == b"HG8B6C")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+let lower_path = lower_clean(path.as_bytes());
+if find_slice_in_slice(&lower_path, b".hg").is_some() {
+let lower_parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if let Some(pos) = lower_parts[1..]
+.iter()
+.position(|part| part == &pattern.as_slice())
+{
+let base = lower_parts[..=pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+// We don't want to add "foo/bar/baz" to `audited_dirs` before checking
+// if there's a "foo/.hg" directory. This also means we won't
+// accidentally traverse a symlink into some other filesystem (which
+// is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix = &parts[..index + 1].join(&b'/');
+let prefix = HgPath::new(prefix);
+if self.audited_dirs.contains(prefix) {
+contin

D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread Raphaël Gomès
Alphare added inline comments.

INLINE COMMENTS

> martinvonz wrote in path_auditor.rs:86
> Sorry, I just meant that we do it this way so that we can provide different 
> error messages for "inside .hg" and "inside subrepo" :)

Good idea. I'm rewriting my patches, I will soon send an update for all of them.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread martinvonz (Martin von Zweigbergk)
martinvonz added inline comments.

INLINE COMMENTS

> Alphare wrote in path_auditor.rs:86
> I see. What error message do you propose?

Sorry, I just meant that we do it this way so that we can provide different 
error messages for "inside .hg" and "inside subrepo" :)

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-02-06 Thread Raphaël Gomès
Alphare added inline comments.

INLINE COMMENTS

> martinvonz wrote in path_auditor.rs:86
> Since `HgRepoPath` is a path that's stored in the repo, I don't think so. Oh, 
> we just want to provide a different error message so we handle that further 
> up (lines 60-61).

I see. What error message do you propose?

> martinvonz wrote in path_auditor.rs:57-58
> nit 1: inline the first `first_component` rather than redefining it on the 
> next line?
> nit 2: does `as_slice()` better convey what the second line does?

nit 1: that creates a temporary variable that is instantly freed, so no luck 
there

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-24 Thread martinvonz (Martin von Zweigbergk)
martinvonz added inline comments.

INLINE COMMENTS

> Alphare wrote in path_auditor.rs:86
> It would be valid to have a path inside the root `.hg` wouldn't it?

Since `HgRepoPath` is a path that's stored in the repo, I don't think so. Oh, 
we just want to provide a different error message so we handle that further up 
(lines 60-61).

> Alphare wrote in path_auditor.rs:123-124
> I think the original comment mentions the logic within this function, not 
> later uses of the PathAuditor. But maybe I misunderstood your question.

I've sent D8002  to fix the Python side. 
Can you update this to match (assuming I'm right, of course)?

> path_auditor.rs:57-58
> +
> +let first_component = lower_clean(parts[0]);
> +let first_component = first_component.deref();
> +if !path.split_drive().0.is_empty()

nit 1: inline the first `first_component` rather than redefining it on the next 
line?
nit 2: does `as_slice()` better convey what the second line does?

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-24 Thread Raphaël Gomès
Alphare updated this revision to Diff 19544.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19351&id=19544

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,238 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::lower_clean,
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+use std::ops::Deref;
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+
+let first_component = lower_clean(parts[0]);
+let first_component = first_component.deref();
+if !path.split_drive().0.is_empty()
+|| (first_component == b".hg"
+|| first_component == b".hg."
+|| first_component == b"")
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let first =
+split.next().unwrap().to_owned().to_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(u8::is_ascii_digit)
+&& (first == b"HG" || first == b"HG8B6C")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+let lower_path = lower_clean(path.as_bytes());
+if find_slice_in_slice(&lower_path, b".hg").is_some() {
+let lower_parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if let Some(pos) =
+lower_parts[1..].iter().position(|part| part == 
&pattern.deref())
+{
+let base = lower_parts[..pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+let mut prefixes = vec![];
+
+// It's important that we check the path parts starting from the root.
+// This means we won't accidentally traverse a symlink into some other
+// filesystem (which is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix =
+&parts[..index + 1].join(&b'/');
+let prefix = HgPath::new(prefix);
+if self.audited_dirs.

D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-24 Thread Raphaël Gomès
Alphare added inline comments.
Alphare marked an inline comment as done.

INLINE COMMENTS

> martinvonz wrote in path_auditor.rs:55
> As @yuja pointed out on D7864 , it's 
> weird to have `split_drive()` on `HgPath`, because that is supposed to be a 
> repo-relative path. I think it would be better to move it into this file (or 
> elsewhere). Can be done in a follow-up.

Yes, it should just act on bytes and be a separate function in `utils/files.rs`.

> martinvonz wrote in path_auditor.rs:72
> or is `u8::is_ascii_digit` preferred? (just a question; i don't know which is 
> preferred)

I like the point-free approach, I don't often remember to use it, since most of 
the type iterator adapters are not that trivial.

> martinvonz wrote in path_auditor.rs:73
> Is this line equivalent to `&& (first == b"HG" || first == b"HG8B6C")`? If 
> so, I would strongly prefer that.
> 
> Same comment on line 56, actually (I reviewed out of of order). Would 
> probably be helpful to extract `&lower_clean(parts[0]).as_ref()` to a 
> variable anyway.

Much clearer indeed, I don't know what I was thinking.

> martinvonz wrote in path_auditor.rs:83-84
> Would we ever get different results from `lower_clean(path).split()` and 
> `split(path).map(lower_clean)`? If not, shouldn't we reuse the 
> `lower_clean()`ed result from above and call `split()` on that?

I don't think that could ever happen, since the `/` is left intact 
`lower-clean` and no weird multi-byte shenanigans would arise.

> martinvonz wrote in path_auditor.rs:86
> What's the `1` about? It seems to say it's okay if the path has `.hg` as its 
> first component. Why?

It would be valid to have a path inside the root `.hg` wouldn't it?

> martinvonz wrote in path_auditor.rs:87-90
> Can we eliminate this by using `.enumerate()` on line 85?

Used pattern matching instead of unwrapping.

> martinvonz wrote in path_auditor.rs:91-95
> Will it be confusing that this is not necessarily a prefix of `path`? 
> Specifically, it seems like it's going to be lower-case and with possibly 
> different path separators.

I wondered when looking at the Python implementation, but I think this has the 
added value of telling the user what the path was transformed to. Maybe in a 
follow-up we could change add the original path in the error message... but I'm 
not sure it's worth the hassle.

> martinvonz wrote in path_auditor.rs:114
> Should this be a `PathBuf`? It's weird to have a `HgPath` containing 
> `std::path::MAIN_SEPARATOR`.

It has to be an `HgPath`, but I agree that I should just use a plain `b'/'`.

> martinvonz wrote in path_auditor.rs:123-124
> It looks like we check the prefixes in order, so first "foo", then "foo/bar", 
> then "foo/bar/baz". We'd return early if we find "foo/.hg", before we get to 
> "foo/bar", no? What am I missing?

I think the original comment mentions the logic within this function, not later 
uses of the PathAuditor. But maybe I misunderstood your question.

> martinvonz wrote in path_auditor.rs:144
> Could you replace "for" by "because" if that's what it means here. Or fix the 
> sentence if that's not what you mean.

This entire sentence is confusing and the first one is useful enough. I copied 
it over, but it's simpler without it.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-23 Thread martinvonz (Martin von Zweigbergk)
This revision now requires changes to proceed.
martinvonz added inline comments.
martinvonz requested changes to this revision.
martinvonz added subscribers: yuja, martinvonz.

INLINE COMMENTS

> path_auditor.rs:55
> +.collect();
> +if !path.split_drive().0.is_empty()
> +|| [&b".hg"[..], &b".hg."[..], &b""[..]]

As @yuja pointed out on D7864 , it's 
weird to have `split_drive()` on `HgPath`, because that is supposed to be a 
repo-relative path. I think it would be better to move it into this file (or 
elsewhere). Can be done in a follow-up.

> path_auditor.rs:70
> +let mut first = split.next().unwrap().to_owned();
> +first.make_ascii_uppercase();
> +let last = split.next().unwrap();

nit: chain `to_ascii_uppercase()` onto the previous line (and drop `mut`) 
instead

> path_auditor.rs:72
> +let last = split.next().unwrap();
> +if last.iter().all(|b| b.is_ascii_digit())
> +&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())

or is `u8::is_ascii_digit` preferred? (just a question; i don't know which is 
preferred)

> path_auditor.rs:73
> +if last.iter().all(|b| b.is_ascii_digit())
> +&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())
> +{

Is this line equivalent to `&& (first == b"HG" || first == b"HG8B6C")`? If so, 
I would strongly prefer that.

Same comment on line 56, actually (I reviewed out of of order). Would probably 
be helpful to extract `&lower_clean(parts[0]).as_ref()` to a variable anyway.

> path_auditor.rs:83-84
> +{
> +let lower_parts: Vec<_> =
> +parts.iter().map(|p| lower_clean(p)).collect();
> +for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {

Would we ever get different results from `lower_clean(path).split()` and 
`split(path).map(lower_clean)`? If not, shouldn't we reuse the 
`lower_clean()`ed result from above and call `split()` on that?

> path_auditor.rs:86
> +for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
> +if lower_parts[1..].contains(pattern) {
> +let pos = lower_parts

What's the `1` about? It seems to say it's okay if the path has `.hg` as its 
first component. Why?

> path_auditor.rs:87-90
> +let pos = lower_parts
> +.iter()
> +.position(|part| part == pattern)
> +.unwrap();

Can we eliminate this by using `.enumerate()` on line 85?

> path_auditor.rs:91-95
> +let base = lower_parts[..pos]
> +.iter()
> +.fold(HgPathBuf::new(), |acc, p| {
> +acc.join(HgPath::new(p))
> +});

Will it be confusing that this is not necessarily a prefix of `path`? 
Specifically, it seems like it's going to be lower-case and with possibly 
different path separators.

> path_auditor.rs:114
> +&parts[..index + 1].join(&(std::path::MAIN_SEPARATOR as u8));
> +let prefix = HgPath::new(prefix);
> +if self.audited_dirs.contains(prefix) {

Should this be a `PathBuf`? It's weird to have a `HgPath` containing 
`std::path::MAIN_SEPARATOR`.

> path_auditor.rs:123-124
> +self.audited.insert(path.to_owned());
> +// Only add prefixes to the cache after checking everything: we don't
> +// want to add "foo/bar/baz" before checking if there's a "foo/.hg"
> +self.audited_dirs.extend(prefixes);

It looks like we check the prefixes in order, so first "foo", then "foo/bar", 
then "foo/bar/baz". We'd return early if we find "foo/.hg", before we get to 
"foo/bar", no? What am I missing?

> path_auditor.rs:144
> +// EINVAL can be raised as invalid path syntax under win32.
> +// They must be ignored for patterns can be checked too.
> +if e.kind() != std::io::ErrorKind::NotFound

Could you replace "for" by "because" if that's what it means here. Or fix the 
sentence if that's not what you mean.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox, martinvonz
Cc: martinvonz, yuja, durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-16 Thread Raphaël Gomès
Alphare updated this revision to Diff 19351.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19317&id=19351

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,235 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::lower_clean,
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+if !path.split_drive().0.is_empty()
+|| [&b".hg"[..], &b".hg."[..], &b""[..]]
+.contains(&lower_clean(parts[0]).as_ref())
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let mut first = split.next().unwrap().to_owned();
+first.make_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(|b| b.is_ascii_digit())
+&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+if find_slice_in_slice(&lower_clean(path.as_bytes()), b".hg").is_some()
+{
+let lower_parts: Vec<_> =
+parts.iter().map(|p| lower_clean(p)).collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if lower_parts[1..].contains(pattern) {
+let pos = lower_parts
+.iter()
+.position(|part| part == pattern)
+.unwrap();
+let base = lower_parts[..pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+let mut prefixes = vec![];
+
+// It's important that we check the path parts starting from the root.
+// This means we won't accidentally traverse a symlink into some other
+// filesystem (which is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix =
+&parts[..index + 1].join(&(std::path::MAIN_SEPARATOR as u8));
+let prefix = HgPath::new(prefix);
+if self.audited_dirs.contains(prefix) {
+continue;
+}
+self.check_filesystem(&prefix

D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-15 Thread Raphaël Gomès
Alphare updated this revision to Diff 19317.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19258&id=19317

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,235 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::{lower_clean, split_drive},
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|b| std::path::is_separator(*b as char))
+.collect();
+if !split_drive(path).0.is_empty()
+|| [&b".hg"[..], &b".hg."[..], &b""[..]]
+.contains(&lower_clean(parts[0]).as_ref())
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let mut first = split.next().unwrap().to_owned();
+first.make_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(|b| b.is_ascii_digit())
+&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+if find_slice_in_slice(&lower_clean(path.as_bytes()), b".hg").is_some()
+{
+let lower_parts: Vec<_> =
+parts.iter().map(|p| lower_clean(p)).collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if lower_parts[1..].contains(pattern) {
+let pos = lower_parts
+.iter()
+.position(|part| part == pattern)
+.unwrap();
+let base = lower_parts[..pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+let mut prefixes = vec![];
+
+// It's important that we check the path parts starting from the root.
+// This means we won't accidentally traverse a symlink into some other
+// filesystem (which is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix =
+&parts[..index + 1].join(&(std::path::MAIN_SEPARATOR as u8));
+let prefix = HgPath::new(prefix);
+if self.audited_dirs.contains(prefix) {
+continue;
+}
+self.check_file

D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-15 Thread Raphaël Gomès
Alphare added inline comments.
Alphare marked 2 inline comments as done.

INLINE COMMENTS

> kevincox wrote in path_auditor.rs:54
> It would be nice to have this in a helper function in path to get a component 
> iterator.

I think that's a good idea indeed, but I would prefer to do it in another patch.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-15 Thread kevincox (Kevin Cox)
kevincox added inline comments.
kevincox accepted this revision.

INLINE COMMENTS

> path_auditor.rs:53
> +.as_bytes()
> +.split(|c| *c as char == std::path::MAIN_SEPARATOR)
> +.collect();

Should this be `std::path::is_separator(*c as char)`?.

If not please add a comment explaining why.

> path_auditor.rs:54
> +.split(|c| *c as char == std::path::MAIN_SEPARATOR)
> +.collect();
> +if !split_drive(path).0.is_empty()

It would be nice to have this in a helper function in path to get a component 
iterator.

> path_auditor.rs:72
> +let last = split.next().unwrap();
> +if last.iter().all(|b| (*b as char).is_digit(10))
> +&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())

You can just use 
https://doc.rust-lang.org/std/primitive.u8.html#method.is_ascii_digit

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

To: Alphare, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

2020-01-14 Thread Raphaël Gomès
Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  It does not offer the same flexibility as the Python implementation, but
  should check incoming paths just as well.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs 
b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,235 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès ,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+files::{lower_clean, split_drive},
+find_slice_in_slice,
+hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+audited: HashSet,
+audited_dirs: HashSet,
+root: PathBuf,
+}
+
+impl PathAuditor {
+pub fn new(root: impl AsRef) -> Self {
+Self {
+root: root.as_ref().to_owned(),
+..Default::default()
+}
+}
+pub fn audit_path(
+&mut self,
+path: impl AsRef,
+) -> Result<(), HgPathError> {
+// TODO windows "localpath" normalization
+let path = path.as_ref();
+if path.is_empty() {
+return Ok(());
+}
+// TODO case normalization
+if self.audited.contains(path) {
+return Ok(());
+}
+// AIX ignores "/" at end of path, others raise EISDIR.
+let last_byte = path.as_bytes()[path.len() - 1];
+if last_byte == b'/' || last_byte == b'\\' {
+return Err(HgPathError::EndsWithSlash(path.to_owned()));
+}
+let parts: Vec<_> = path
+.as_bytes()
+.split(|c| *c as char == std::path::MAIN_SEPARATOR)
+.collect();
+if !split_drive(path).0.is_empty()
+|| [&b".hg"[..], &b".hg."[..], &b""[..]]
+.contains(&lower_clean(parts[0]).as_ref())
+|| parts.iter().any(|c| c == b"..")
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+
+// Windows shortname aliases
+for part in parts.iter() {
+if part.contains(&b'~') {
+let mut split = part.splitn(1, |b| *b == b'~');
+let mut first = split.next().unwrap().to_owned();
+first.make_ascii_uppercase();
+let last = split.next().unwrap();
+if last.iter().all(|b| (*b as char).is_digit(10))
+&& [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())
+{
+return Err(HgPathError::ContainsIllegalComponent(
+path.to_owned(),
+));
+}
+}
+}
+if find_slice_in_slice(&lower_clean(path.as_bytes()), b".hg").is_some()
+{
+let lower_parts: Vec<_> =
+parts.iter().map(|p| lower_clean(p)).collect();
+for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+if lower_parts[1..].contains(pattern) {
+let pos = lower_parts
+.iter()
+.position(|part| part == pattern)
+.unwrap();
+let base = lower_parts[..pos]
+.iter()
+.fold(HgPathBuf::new(), |acc, p| {
+acc.join(HgPath::new(p))
+});
+return Err(HgPathError::IsInsideNestedRepo {
+path: path.to_owned(),
+nested_repo: base,
+});
+}
+}
+}
+
+let parts = &parts[..parts.len().saturating_sub(1)];
+
+let mut prefixes = vec![];
+
+// It's important that we check the path parts starting from the root.
+// This means we won't accidentally traverse a symlink into some other
+// filesystem (which is potentially expensive to access).
+for index in 0..parts.len() {
+let prefix =
+&parts[..index + 1].join(&(std::path::MAIN_SEPARATOR as u8));
+let prefix = HgPath::new(prefix);
+if self.audited_dirs.contains(