MaxGekk commented on a change in pull request #26079: [SPARK-29369][SQL] 
Support string intervals without the `interval` prefix
URL: https://github.com/apache/spark/pull/26079#discussion_r333579599
 
 

 ##########
 File path: 
common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java
 ##########
 @@ -73,45 +72,53 @@ private static long toLong(String s) {
    * This method is case-insensitive.
    */
   public static CalendarInterval fromString(String s) {
-    if (s == null) {
-      return null;
-    }
-    s = s.trim();
-    Matcher m = p.matcher(s);
-    if (!m.matches() || s.compareToIgnoreCase("interval") == 0) {
+    try {
+      return fromCaseInsensitiveString(s);
+    } catch (IllegalArgumentException e) {
       return null;
-    } else {
-      long months = toLong(m.group(1)) * 12 + toLong(m.group(2));
-      long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK;
-      microseconds += toLong(m.group(4)) * MICROS_PER_DAY;
-      microseconds += toLong(m.group(5)) * MICROS_PER_HOUR;
-      microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE;
-      microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
-      microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
-      microseconds += toLong(m.group(9));
-      return new CalendarInterval((int) months, microseconds);
     }
   }
 
   /**
-   * Convert a string to CalendarInterval. Unlike fromString, this method can 
handle
+   * Convert a string to CalendarInterval. This method can handle
    * strings without the `interval` prefix and throws IllegalArgumentException
    * when the input string is not a valid interval.
    *
    * @throws IllegalArgumentException if the string is not a valid internal.
    */
   public static CalendarInterval fromCaseInsensitiveString(String s) {
-    if (s == null || s.trim().isEmpty()) {
-      throw new IllegalArgumentException("Interval cannot be null or blank.");
+    if (s == null) {
+      throw new IllegalArgumentException("Interval cannot be null");
     }
-    String sInLowerCase = s.trim().toLowerCase(Locale.ROOT);
-    String interval =
-      sInLowerCase.startsWith("interval ") ? sInLowerCase : "interval " + 
sInLowerCase;
-    CalendarInterval cal = fromString(interval);
-    if (cal == null) {
+    String trimmed = s.trim();
+    if (trimmed.isEmpty()) {
+      throw new IllegalArgumentException("Interval cannot be blank");
+    }
+    String prefix = "interval";
+    String intervalStr = trimmed;
+    // Checks the given interval string does not start with the `interval` 
prefix
+    if (!intervalStr.regionMatches(true, 0, prefix, 0, prefix.length())) {
+      // Prepend `interval` if it does not present because
+      // the regular expression strictly require it.
 
 Review comment:
   > String intervalStr = trimmed.toLowerCase();
   
   Your code is more expensive because you lower case whole input string.
   
   > // parse the interval string assuming there is no leading "interval"
   
   Here there is a problem with current regexp when you delete the anchor 
`"interval"`. Without this anchor, it cannot match to valid inputs:
   ```scala
   scala> import java.util.regex._
   import java.util.regex._
   
   scala> def unitRegex(unit: String) = "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?"
   unitRegex: (unit: String)String
   
   scala> val p = Pattern.compile(unitRegex("year") + unitRegex("month") +
        |     unitRegex("week") + unitRegex("day") + unitRegex("hour") + 
unitRegex("minute") +
        |     unitRegex("second") + unitRegex("millisecond") + 
unitRegex("microsecond"),
        |     Pattern.CASE_INSENSITIVE)
   p: java.util.regex.Pattern = 
(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?
   
   scala> val m = p.matcher("1 month 1 second")
   m: java.util.regex.Matcher = 
java.util.regex.Matcher[pattern=(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?
 region=0,16 lastmatch=]
   
   scala> m.matches()
   res7: Boolean = false
   ```
   If we added it back:
   ```scala
   scala> val p = Pattern.compile("interval" + unitRegex("year") + 
unitRegex("month") +
        |     unitRegex("week") + unitRegex("day") + unitRegex("hour") + 
unitRegex("minute") +
        |     unitRegex("second") + unitRegex("millisecond") + 
unitRegex("microsecond"),
        |     Pattern.CASE_INSENSITIVE)
   p: java.util.regex.Pattern = 
interval(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?
   
   scala> val m = p.matcher("interval 1 month 1 second")
   m: java.util.regex.Matcher = 
java.util.regex.Matcher[pattern=interval(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?
 region=0,25 lastmatch=]
   
   scala> m.matches()
   res8: Boolean = true
   ```
   it can match now. That's why I had to add the `interval` prefix instead of 
removing it.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to