udim commented on a change in pull request #14431:
URL: https://github.com/apache/beam/pull/14431#discussion_r616961819



##########
File path: examples/notebooks/tour-of-beam/reading-and-writing-data.ipynb
##########
@@ -423,28 +422,36 @@
           "base_uri": "https://localhost:8080/"
         },
         "id": "ywVbJxegaZbo",
-        "outputId": "5e0adfa3-e685-4fe0-b6b7-bfa3d8469da1"
+        "outputId": "042ee6be-d4d6-4983-9a7c-5f876ba0f655"
       },
       "source": [
         "import apache_beam as beam\n",
+        "from apache_beam.io.filesystems import FileSystems as beam_fs\n",
+        "import codecs\n",
         "import csv\n",
-        "import glob\n",
         "\n",
         "class ReadCsvFiles(beam.PTransform):\n",
         "  def __init__(self, file_patterns):\n",
         "    self.file_patterns = file_patterns\n",
         "\n",
         "  @staticmethod\n",
+        "  def expand_pattern(pattern):\n",
+        "    for match_result in beam_fs.match([pattern])[0].metadata_list:\n",
+        "      yield match_result.path\n",
+        "\n",
+        "  @staticmethod\n",
         "  def read_csv_lines(file_name):\n",
-        "    with open(file_name, 'r') as f:\n",
-        "      for row in csv.DictReader(f):\n",
+        "    with beam_fs.open(file_name) as f:\n",
+        "      # Beam reads files as bytes, but csv expects strings,\n",
+        "      # so we need to decode the bytes into utf-8 strings.\n",
+        "      for row in csv.DictReader(codecs.iterdecode(f, 'utf-8')):\n",
         "        yield dict(row)\n",
         "\n",
         "  def expand(self, pcollection):\n",
         "    return (\n",
         "        pcollection\n",
         "        | 'Create file patterns' >> beam.Create(self.file_patterns)\n",
-        "        | 'Expand file patterns' >> beam.FlatMap(glob.glob)\n",
+        "        | 'Expand file patterns' >> beam.FlatMap(self.expand_pattern)\n",

Review comment:
       Would Beam's `ReadAllFromText` work better? It handles glob matching and UTF-8 decoding.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to