jonkeane commented on code in PR #41969:
URL: https://github.com/apache/arrow/pull/41969#discussion_r1626854137


##########
r/R/metadata.R:
##########
@@ -44,23 +44,96 @@
 }
 
 .deserialize_arrow_r_metadata <- function(x) {
-  tryCatch(
-    expr = {
-      out <- unserialize(charToRaw(x))
-
-      # if this is still raw, try decompressing
-      if (is.raw(out)) {
-        out <- unserialize(memDecompress(out, type = "gzip"))
-      }
-      out
-    },
+  tryCatch(unserialize_r_metadata(x),
     error = function(e) {
+      if (getOption("arrow.debug", FALSE)) {
+        print(conditionMessage(e))
+      }
       warning("Invalid metadata$r", call. = FALSE)
       NULL
     }
   )
 }
 
+unserialize_r_metadata <- function(x) {
+  # Check that this is ASCII serialized data (as in, what we wrote)
+  if (!identical(substr(unclass(x), 1, 1), "A")) {

Review Comment:
   I know it's not strictly necessary, but would asserting that this is `ARROW` 
be a bit more obvious?



##########
r/R/metadata.R:
##########
@@ -44,23 +44,96 @@
 }
 
 .deserialize_arrow_r_metadata <- function(x) {
-  tryCatch(
-    expr = {
-      out <- unserialize(charToRaw(x))
-
-      # if this is still raw, try decompressing
-      if (is.raw(out)) {
-        out <- unserialize(memDecompress(out, type = "gzip"))
-      }
-      out
-    },
+  tryCatch(unserialize_r_metadata(x),
     error = function(e) {
+      if (getOption("arrow.debug", FALSE)) {
+        print(conditionMessage(e))
+      }
       warning("Invalid metadata$r", call. = FALSE)
       NULL
     }
   )
 }
 
+unserialize_r_metadata <- function(x) {
+  # Check that this is ASCII serialized data (as in, what we wrote)
+  if (!identical(substr(unclass(x), 1, 1), "A")) {
+    stop("Invalid serialized data")
+  }
+  out <- safe_unserialize(charToRaw(x))
+  # If it's still raw, check for the gzip magic number and uncompress
+  if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) {

Review Comment:
   `as.raw(c(31, 139))` is magic, indeed.



##########
r/R/metadata.R:
##########
@@ -44,23 +44,96 @@
 }
 
 .deserialize_arrow_r_metadata <- function(x) {
-  tryCatch(
-    expr = {
-      out <- unserialize(charToRaw(x))
-
-      # if this is still raw, try decompressing
-      if (is.raw(out)) {
-        out <- unserialize(memDecompress(out, type = "gzip"))
-      }
-      out
-    },
+  tryCatch(unserialize_r_metadata(x),
     error = function(e) {
+      if (getOption("arrow.debug", FALSE)) {
+        print(conditionMessage(e))
+      }
       warning("Invalid metadata$r", call. = FALSE)
       NULL
     }
   )
 }
 
+unserialize_r_metadata <- function(x) {
+  # Check that this is ASCII serialized data (as in, what we wrote)
+  if (!identical(substr(unclass(x), 1, 1), "A")) {
+    stop("Invalid serialized data")
+  }
+  out <- safe_unserialize(charToRaw(x))
+  # If it's still raw, check for the gzip magic number and uncompress
+  if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) {
+    decompressed <- memDecompress(out, type = "gzip")
+    if (!identical(substr(decompressed, 1, 1), "A")) {
+      stop("Invalid serialized data")
+    }
+    out <- safe_unserialize(decompressed)
+  }
+  if (!is.list(out)) {
+    stop("Invalid serialized data")
+  }
+  safe_r_metadata(out)
+}
+
+safe_unserialize <- function(x) {
+  # By capturing the data in a list, we can inspect it for promises without
+  # triggering their evaluation.
+  out <- list(unserialize(x))
+  if (typeof(out[[1]]) == "promise") {
+    stop("Serialized data contains a promise object")

Review Comment:
   ```suggestion
       stop("Invalid serialized data: Serialized data contains a promise 
object")
   ```
   
   Up for other suggestions, but it would be good to make it clear that 
Serialized data containing a promise is problematic.



##########
r/R/metadata.R:
##########
@@ -44,23 +44,96 @@
 }
 
 .deserialize_arrow_r_metadata <- function(x) {
-  tryCatch(
-    expr = {
-      out <- unserialize(charToRaw(x))
-
-      # if this is still raw, try decompressing
-      if (is.raw(out)) {
-        out <- unserialize(memDecompress(out, type = "gzip"))
-      }
-      out
-    },
+  tryCatch(unserialize_r_metadata(x),
     error = function(e) {
+      if (getOption("arrow.debug", FALSE)) {
+        print(conditionMessage(e))
+      }
       warning("Invalid metadata$r", call. = FALSE)
       NULL
     }
   )
 }
 
+unserialize_r_metadata <- function(x) {
+  # Check that this is ASCII serialized data (as in, what we wrote)
+  if (!identical(substr(unclass(x), 1, 1), "A")) {
+    stop("Invalid serialized data")
+  }
+  out <- safe_unserialize(charToRaw(x))
+  # If it's still raw, check for the gzip magic number and uncompress
+  if (is.raw(out) && identical(out[1:2], as.raw(c(31, 139)))) {
+    decompressed <- memDecompress(out, type = "gzip")
+    if (!identical(substr(decompressed, 1, 1), "A")) {
+      stop("Invalid serialized data")
+    }
+    out <- safe_unserialize(decompressed)
+  }
+  if (!is.list(out)) {
+    stop("Invalid serialized data")
+  }
+  safe_r_metadata(out)
+}
+
+safe_unserialize <- function(x) {
+  # By capturing the data in a list, we can inspect it for promises without
+  # triggering their evaluation.
+  out <- list(unserialize(x))
+  if (typeof(out[[1]]) == "promise") {
+    stop("Serialized data contains a promise object")
+  }
+  out[[1]]
+}
+
+safe_r_metadata <- function(metadata, on_save = FALSE) {

Review Comment:
   `on_save` is to make if `safe_r_metadata` is saving metadata or loading 
metadata, yeah? It would be nice to either have doc strings (even just as a 
comment) explaining that, or maybe `saving = FALSE` is slightly more 
transparent to me?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to