Hannah-Jiang commented on a change in pull request #13117:
URL: https://github.com/apache/beam/pull/13117#discussion_r504979231



##########
File path: sdks/java/container/license_scripts/pull_licenses_java.py
##########
@@ -101,13 +101,22 @@ def pull_from_url(file_name, url, dep, no_list):
 
 def pull_source_code(base_url, dir_name, dep):
     # base_url example: 
https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/
-    soup = BeautifulSoup(urlopen(base_url).read(), "html.parser")
+    try:
+      soup = BeautifulSoup(urlopen(base_url).read(), "html.parser")
+    except:
+      logging.error('Error reading source base from 
{base_url}'.format(base_url=base_url))
+      raise
+    source_count = 0
     for href in (a["href"] for a in soup.select("a[href]")):
         if href.endswith(
-                '.jar') and not 'javadoc' in href:  # download jar file only
+                '.jar') and 'sources.jar' in href:  # download sources jar 
file only

Review comment:
       Do all source jars follow this pattern?

##########
File path: sdks/java/container/license_scripts/pull_licenses_java.py
##########
@@ -101,13 +101,22 @@ def pull_from_url(file_name, url, dep, no_list):
 
 def pull_source_code(base_url, dir_name, dep):
     # base_url example: 
https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/
-    soup = BeautifulSoup(urlopen(base_url).read(), "html.parser")
+    try:
+      soup = BeautifulSoup(urlopen(base_url).read(), "html.parser")
+    except:
+      logging.error('Error reading source base from 
{base_url}'.format(base_url=base_url))
+      raise
+    source_count = 0
     for href in (a["href"] for a in soup.select("a[href]")):
         if href.endswith(
-                '.jar') and not 'javadoc' in href:  # download jar file only
+                '.jar') and 'sources.jar' in href:  # download sources jar 
file only
             file_name = dir_name + '/' + href
             url = base_url + '/' + href
+            logging.info('Pulling source from {url}'.format(url=url))

Review comment:
       How about changing the level to `debug`? If many packages need their 
source pulled, a lot of log output would be printed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to