udim commented on a change in pull request #11428: [BEAM-9764] multi threading & check urls instead of pulling
URL: https://github.com/apache/beam/pull/11428#discussion_r410403628
 
 

 ##########
 File path: sdks/java/container/license_scripts/pull_licenses_java.py
 ##########
 @@ -108,89 +244,27 @@ def write_to_csv(csv_dict):
 
     maven_url_temp = 'https://repo1.maven.org/maven2/{module}/{version}'
 
-    for dep in dependencies['dependencies']:
-        '''
-        An example of a Json blob.
-        {
-            "moduleName": "antlr:antlr",
-            "moduleUrl": "http://www.antlr.org/",
-            "moduleVersion": "2.7.7",
-            "moduleLicense": "BSD License",
-            "moduleLicenseUrl": "http://www.antlr.org/license.html"
-        }
-        '''
-        name = dep['moduleName'].split(':')[1].lower()
-        version = dep['moduleVersion']
-        name_version = name + '-' + version
-        dir_name = '{license_dir}/{name_version}.jar'.format(
-            license_dir=LICENSE_DIR, name_version=name_version)
-        # if auto pulled, directory is existing at {license_dir}
-        if not os.path.isdir(dir_name):
-            # skip self dependencies
-            if dep['moduleName'].startswith('beam'):
-                print('Skippig', name + '-' + version)
-                continue
-            os.mkdir(dir_name)
-            # pull license
-            try:
-                license_url = dep_config[name][version]['license']
-            except:
-                license_url = dep['moduleLicenseUrl']
-            pull_from_url(dir_name + '/LICENSE', license_url, name_version,
-                          no_licenses)
-            # pull notice
-            try:
-                notice_url = dep_config[name][version]['notice']
-                pull_from_url(dir_name + '/NOTICE', notice_url, name_version,
-                              no_licenses)
-            except:
-                notice_url = None
-        else:
-            try:
-                license_url = dep['moduleLicenseUrl']
-            except:
-                license_url = ''
-            print(
-                'License/notice for {name_version} were pulled automatically.'.
-                format(name_version=name_version))
-
-        # get license_type to decide if pull source code.
-        try:
-            license_type = dep['moduleLicense']
-        except:
-            try:
-                license_type = dep_config[name][version]['type']
-            except:
-                no_license_type.add(name_version)
-                license_type = ''
-                continue
+    csv_list = []
+    no_licenses = []
+    no_license_type = []
+    incorrect_source_url = []
 
-        # pull source code if license_type is one of SOURCE_CODE_REQUIRED_LICENSES.
-        if any(x in license_type.lower()
-               for x in SOURCE_CODE_REQUIRED_LICENSES):
-            try:
-                base_url = dep_config[name][version]['source']
-            except:
-                module = dep['moduleName'].split(':')[0].replace('.', '/')
-                base_url = maven_url_temp.format(module=module + '/' + name,
-                                                 version=version)
-            pull_source_code(base_url, dir_name, name_version,
-                             incorrect_source_url)
-            source_included = True
-        else:
-            source_included = False
-
-        csv_dict[name_version] = {
-            'url_to_license': license_url,
-            'license_type': license_type,
-            'source_included': source_included
-        }
+    queue = Queue()
+    for x in range(THREADS):
+        worker = Worker(queue)
+        worker.daemon = True
+        worker.start()
+    for dep in dependencies['dependencies']:
+        queue.put(dep)
+    queue.join()
 
 
 Review comment:
   This looks good, except you're not waiting for all threads to finish, so `csv_list` might be incomplete.
   
   A ThreadPoolExecutor would be useful here and do what you want: 
   Rough example:
   ```py
   with concurrent.futures.ThreadPoolExecutor(max_workers=THREADS) as executor:
     futures = [executor.submit(execute, dep) for dep in dependencies['dependencies']]
     ... # get the future results as they become available, see example link below
   ```
   
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
   
   Currently the code in `execute` does `csv_list.append(csv_dict)` at the end, 
which needs synchronization to work right (I assume that csv_list is shared 
among threads). Returning csv_dict instead will make it available as a 
`future.result()`.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to