[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 157 +-- 1 file changed, 141 insertions(+), 16 deletions(-) New commits: commit 001ab94b0af51d9908ce078b4f422a87ad79d971 Author: gulsahkose AuthorDate: Tue Oct 19 17:10:03 2021 +0300 Commit: gulsahkose CommitDate: Tue Oct 19 17:10:03 2021 +0300 Unparsable commit is sent to fetch local and remote repo. Change-Id: I115d0097c14e192c0c93a8aabd186fc7e4296f23 diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index cc7a7a036980..93150e6897f2 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -1,6 +1,7 @@ #!/usr/bin/python import sys, getopt, os, shutil +from typing import Type import xml.etree.ElementTree as ET from zipfile import ZipFile from lxml import etree @@ -10,24 +11,27 @@ def main(argv): outputdir = '' extracted_files_dir_by_user = '' extracted_files_dir = '' +fileformat = '' #read the arguments try: - opts, args = getopt.getopt(argv,"hi:o:e:",["idir=","odir="]) + opts, args = getopt.getopt(argv,"hi:o:e:t:",["idir=","odir="]) except getopt.GetoptError: - print ('analyze.py -i -o ') + print ('analyze.py -i -o -t ') sys.exit(2) for opt, arg in opts: - if opt == '-h': - print ('analyze.py -i -o ') - sys.exit() - elif opt == '-e': - extracted_files_dir_by_user = arg - elif opt in ("-i", "--idir"): - inputdir = arg - elif opt in ("-o", "--odir"): - outputdir = arg +if opt == '-h': +print ('analyze.py -i -o -t ') +sys.exit() +elif opt == '-e': +extracted_files_dir_by_user = arg +elif opt in ("-i", "--idir"): +inputdir = arg +elif opt in ("-o", "--odir"): +outputdir = arg +elif opt == '-t': +fileformat = arg if(extracted_files_dir_by_user == ''): # use default directory path for extracted ooxml files. 
@@ -45,7 +49,14 @@ def main(argv): sub_texts_name = ext_dir[i+1:] + ".text" sub_result_list = [] concatenated_texts_list = [] # holds concanated texts for each paragraph -count_elements(ext_dir, sub_result_list, concatenated_texts_list) + +if fileformat == "pptx": +count_pptx_elements(ext_dir, sub_result_list, concatenated_texts_list) +elif fileformat == "xlsx": +count_xlsx_elements(ext_dir, sub_result_list) +else: +print("File format is not supported") +break sub_result_path = os.path.join(outputdir, sub_result_name) sub_texts_path = os.path.join(outputdir, sub_texts_name) @@ -69,8 +80,8 @@ def main(argv): log_file.close() # no need to keep extracted files anymore. -if(os.path.exists(extracted_files_dir)): -shutil.rmtree(extracted_files_dir) +#if(os.path.exists(extracted_files_dir)): +#shutil.rmtree(extracted_files_dir) # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -80,6 +91,7 @@ def extract_files(inputdir, extracted_files_dir): shutil.rmtree(extracted_files_dir) # unzip files into the extracted files directory + for filetype in get_list_of_subdir(inputdir): for filename in os.listdir(filetype): if (filename.endswith(".pptx") or \ @@ -119,13 +131,126 @@ def replace_namespace_with_alias(filename, element): # decides which files should/shouldn't be analyzed. 
def is_file_in_accepted_files(filename): -if(filename.endswith(".xml") and "ppt/slides/" in filename): +if(filename.endswith(".xml") and ("ppt/slides/" in filename or "xl/worksheets" in filename)): return True return False +def read_shared_strings(shared_strings_list, shared_strings_path): +tree = ET.parse(shared_strings_path) +for child in tree.iter(): +if child.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}t': +shared_strings_list.append(child.text) + +def get_pivot_table_range(sheet_relation_path): +tree = ET.parse(sheet_relation_path) +for elem in tree.iter(): +if elem.tag == "{http://schemas.openxmlformats.org/package/2006/relationships}Relationship; and\ + elem.attrib['Type'] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable": +i = sheet_relation_path.rfind('/') +pivot_table_path = os.path.join(sheet_relation_path[:i], ".." ,elem.attrib['Target']) +p_tree = ET.parse(pivot_table_path) +for p_elem in p_tree.iter(): +if p_elem.tag == "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}location; and \ + p_elem.attrib['ref']: +return p_elem.attrib['ref'] +return '' + +def is_cell_in_range(cell_id, cell_range): +i =
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 100 +-- 1 file changed, 42 insertions(+), 58 deletions(-) New commits: commit 18e89687fde3b3cfac00ead00cbefbb98262cdfe Author: Gülşah Köse AuthorDate: Tue Jul 20 14:15:42 2021 +0300 Commit: Gülşah Köse CommitDate: Tue Jul 20 14:20:55 2021 +0300 remove namespace replacing and some small updates Change-Id: I2d56668186c8745fca683025710646ae505a0d6b diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 87acd377c854..cc7a7a036980 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -80,21 +80,22 @@ def extract_files(inputdir, extracted_files_dir): shutil.rmtree(extracted_files_dir) # unzip files into the extracted files directory -for filename in os.listdir(inputdir): -if (filename.endswith(".pptx") or \ -filename.endswith(".docx") or \ -filename.endswith(".xlsx")) and not \ -filename.startswith("~"): -filepath = os.path.join(inputdir, filename) -extracted_file_path = os.path.join(extracted_files_dir, filename) - -try: -with ZipFile(filepath) as zipObj: -zipObj.extractall(extracted_file_path) -except: -print("%s is problematic" % filename) -else: -continue +for filetype in get_list_of_subdir(inputdir): +for filename in os.listdir(filetype): +if (filename.endswith(".pptx") or \ +filename.endswith(".docx") or \ +filename.endswith(".xlsx")) and not \ +filename.startswith("~"): +filepath = os.path.join(filetype, filename) +extracted_file_path = os.path.join(extracted_files_dir, filename) + +try: +with ZipFile(filepath) as zipObj: +zipObj.extractall(extracted_file_path) +except: +print("%s is problematic" % filename) +else: +continue # get key of value in dictionary def get_key(val, dict): @@ -116,34 +117,17 @@ def replace_namespace_with_alias(filename, element): element = element.replace("{" + element_ns + "}", "") return element -# decides which files shouldn't be analyzed. +# decides which files should/shouldn't be analyzed. 
def is_file_in_accepted_files(filename): -if(filename.endswith("[Content_Types].xml") or \ - filename.endswith("docProps/custom.xml") or \ - filename.endswith("docProps/app.xml") or\ - filename.endswith("presentation.xml") or \ - filename.endswith("viewProps.xml") or \ - filename.endswith("tableStyles.xml") or \ - filename.endswith("presProps.xml") or \ - "ppt/slideLayouts" in filename or \ - "ppt/slideMasters" in filename or \ - "ppt/theme" in filename or \ - "ppt/notesMasters" in filename or \ - "ppt/notesSlides" in filename or \ - "ppt/handoutMasters" in filename or \ - "ppt/tags" in filename or \ - "pptx/customXml" in filename or \ - "ppt/diagrams" in filename or \ - filename.endswith("docProps/core.xml") or not \ - filename.endswith(".xml")): - return False - -return True +if(filename.endswith(".xml") and "ppt/slides/" in filename): + return True + +return False # counts tags, attribute names and values of xmls def count_elements(extracted_files_dir, result_list, concanated_texts_list): -# make sure if extracted files directory exist +# make sure if extracted files directory not exist if not (os.path.exists(extracted_files_dir)): print("Extracted files directory is not exist") return @@ -160,7 +144,7 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list): try: # start to count for event, child in etree.iterparse(xmlfile, events=('start', 'end')): -tag = replace_namespace_with_alias(xmlfile, child.tag) +tag = child.tag #replace_namespace_with_alias(xmlfile, child.tag) tag_idx = get_index_of_tag(tag, result_list) if event == "start": @@ -171,30 +155,29 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list): else: result_list[tag_idx][0][tag] += 1 -# count attribute names and values of current tag -for attr_name, attr_value in child.attrib.items(): -attr_name = replace_namespace_with_alias(xmlfile, attr_name) -if not attr_name in result_list[tag_idx][1].keys(): -result_list[tag_idx][1][attr_name] = 1 -else: 
-result_list[tag_idx][1][attr_name] +=1 - -if not
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 103 --- 1 file changed, 57 insertions(+), 46 deletions(-) New commits: commit 7c03a4c092d9cba10ecb22e7f97aaca851259f1f Author: Gülşah Köse AuthorDate: Tue Jun 8 08:43:30 2021 +0300 Commit: Gülşah Köse CommitDate: Tue Jun 8 08:43:30 2021 +0300 Improve error handling, exclude None texts Change-Id: Idedad9c414311d95c355ea70a913f8e0ddf7 diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index a7e2bc2a549f..87acd377c854 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -88,8 +88,11 @@ def extract_files(inputdir, extracted_files_dir): filepath = os.path.join(inputdir, filename) extracted_file_path = os.path.join(extracted_files_dir, filename) -with ZipFile(filepath) as zipObj: -zipObj.extractall(extracted_file_path) +try: +with ZipFile(filepath) as zipObj: +zipObj.extractall(extracted_file_path) +except: +print("%s is problematic" % filename) else: continue @@ -126,6 +129,11 @@ def is_file_in_accepted_files(filename): "ppt/slideMasters" in filename or \ "ppt/theme" in filename or \ "ppt/notesMasters" in filename or \ + "ppt/notesSlides" in filename or \ + "ppt/handoutMasters" in filename or \ + "ppt/tags" in filename or \ + "pptx/customXml" in filename or \ + "ppt/diagrams" in filename or \ filename.endswith("docProps/core.xml") or not \ filename.endswith(".xml")): return False @@ -149,51 +157,54 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list): print(xmlfile) -# start to count -for event, child in etree.iterparse(xmlfile, events=('start', 'end')): -tag = replace_namespace_with_alias(xmlfile, child.tag) -tag_idx = get_index_of_tag(tag, result_list) - -if event == "start": -# count tags -if (tag_idx == -1): -tmp_list = [{tag: 1},{},{},{}] -result_list.append(tmp_list) -else: -result_list[tag_idx][0][tag] += 1 - -# count attribute names and values of current tag -for attr_name, attr_value in child.attrib.items(): -attr_name = replace_namespace_with_alias(xmlfile, attr_name) -if not attr_name in 
result_list[tag_idx][1].keys(): -result_list[tag_idx][1][attr_name] = 1 +try: +# start to count +for event, child in etree.iterparse(xmlfile, events=('start', 'end')): +tag = replace_namespace_with_alias(xmlfile, child.tag) +tag_idx = get_index_of_tag(tag, result_list) + +if event == "start": +# count tags +if (tag_idx == -1): +tmp_list = [{tag: 1},{},{},{}] +result_list.append(tmp_list) else: -result_list[tag_idx][1][attr_name] +=1 - -if not attr_value in result_list[tag_idx][2].keys(): -result_list[tag_idx][2][attr_value] = 1 -else: -result_list[tag_idx][2][attr_value] +=1 - -# concanated text will be resetted in every paragraph begining -if tag == "a:p": -concatenated_text = "" - - -if event == "end": -# Detect seperate texts in paragraph and concanate them. -if tag == "a:t": -concatenated_text += str(child.text) -# End of the paragraph element, add the text as list item. -if tag == "a:p" and concatenated_text != "": -concanated_texts_list.append(concatenated_text) - -# count text contents except consisted of whitespaces. -if not (str(child.text) == "None" or str(child.text).strip()==""): -if not child.text in result_list[tag_idx][3].keys(): -result_list[tag_idx][3][child.text] = 1 -else: -result_list[tag_idx][3][child.text] += 1 +result_list[tag_idx][0][tag] += 1 + +# count attribute names and values of current tag +for attr_name, attr_value in child.attrib.items(): +attr_name = replace_namespace_with_alias(xmlfile, attr_name) +if not attr_name in result_list[tag_idx][1].keys(): +result_list[tag_idx][1][attr_name] = 1 +else: +result_list[tag_idx][1][attr_name] +=1 + +if not
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 99 --- 1 file changed, 64 insertions(+), 35 deletions(-) New commits: commit f2bde987693fad6e1347f99e34c2ad5291ea8ee6 Author: Gülşah Köse AuthorDate: Thu Jun 3 14:11:05 2021 +0300 Commit: Gülşah Köse CommitDate: Thu Jun 3 14:15:14 2021 +0300 Concatenate separate text runs and create a separate result file for them. E.g.: text1 text2 We will keep the result text as "text1text2". As a result we will create .text to hold that type of text. Change-Id: I946af39e2037db1f986e73039d0a462a36bba1d8 diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 9db39d8c47da..a7e2bc2a549f 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -1,8 +1,9 @@ #!/usr/bin/python -import sys, getopt, os, shutil, pprint +import sys, getopt, os, shutil import xml.etree.ElementTree as ET from zipfile import ZipFile +from lxml import etree def main(argv): inputdir = '' @@ -28,9 +29,6 @@ def main(argv): elif opt in ("-o", "--odir"): outputdir = arg -# holds the result structer of analyze -result_list = [] - if(extracted_files_dir_by_user == ''): # use default directory path for extracted ooxml files. 
extracted_files_dir = os.path.join(outputdir, 'extractedfiles') @@ -40,22 +38,39 @@ def main(argv): extracted_files_dir = extracted_files_dir_by_user # create seperate result files for each ooxml document as .result in output directory +# create seperate concanated texts for each ooxml document as .text in output directory for ext_dir in get_list_of_subdir(extracted_files_dir): i = ext_dir.rfind('/') sub_result_name = ext_dir[i+1:] + ".result" +sub_texts_name = ext_dir[i+1:] + ".text" sub_result_list = [] -count_elements(ext_dir, sub_result_list) +concatenated_texts_list = [] # holds concanated texts for each paragraph +count_elements(ext_dir, sub_result_list, concatenated_texts_list) + sub_result_path = os.path.join(outputdir, sub_result_name) +sub_texts_path = os.path.join(outputdir, sub_texts_name) # sort the result sub list according to tag names sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) +concatenated_texts_list.sort() if os.path.exists(sub_result_path): os.remove(sub_result_path) +if os.path.exists(sub_texts_path): +os.remove(sub_texts_path) + for i in sub_result_list: with open(sub_result_path, "a") as log_file: print(i, file=log_file) -log_file.close() +log_file.close() +for i in concatenated_texts_list: +with open(sub_texts_path, "a") as log_file: +print(i, file=log_file) +log_file.close() + +# no need to keep extracted files anymore. +if(os.path.exists(extracted_files_dir)): +shutil.rmtree(extracted_files_dir) # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -98,6 +113,7 @@ def replace_namespace_with_alias(filename, element): element = element.replace("{" + element_ns + "}", "") return element +# decides which files shouldn't be analyzed. 
def is_file_in_accepted_files(filename): if(filename.endswith("[Content_Types].xml") or \ filename.endswith("docProps/custom.xml") or \ @@ -109,6 +125,7 @@ def is_file_in_accepted_files(filename): "ppt/slideLayouts" in filename or \ "ppt/slideMasters" in filename or \ "ppt/theme" in filename or \ + "ppt/notesMasters" in filename or \ filename.endswith("docProps/core.xml") or not \ filename.endswith(".xml")): return False @@ -116,7 +133,7 @@ def is_file_in_accepted_files(filename): return True # counts tags, attribute names and values of xmls -def count_elements(extracted_files_dir, result_list): +def count_elements(extracted_files_dir, result_list, concanated_texts_list): # make sure if extracted files directory exist if not (os.path.exists(extracted_files_dir)): @@ -131,40 +148,52 @@ def count_elements(extracted_files_dir, result_list): continue print(xmlfile) -tree = ET.parse(xmlfile) -root = tree.getroot() # start to count -for child in root.iter(): +for event, child in etree.iterparse(xmlfile, events=('start', 'end')): tag = replace_namespace_with_alias(xmlfile, child.tag) tag_idx = get_index_of_tag(tag, result_list) -# count tags -if (tag_idx == -1): -tmp_list = [{tag: 1},{},{},{}] -result_list.append(tmp_list) -else: -result_list[tag_idx][0][tag] += 1 - -
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 38 ++ 1 file changed, 22 insertions(+), 16 deletions(-) New commits: commit 056ebfae35f6725b9089439a7bf868dad48fdd0f Author: Gülşah Köse AuthorDate: Mon May 31 16:08:23 2021 +0300 Commit: Gülşah Köse CommitDate: Mon May 31 16:08:28 2021 +0300 Fix the use of an existing extracted files path and change result output. The tool was counting text content that consists only of whitespace. Prevent this: do not count such texts as text content, e.g. " " Change-Id: Ib71123b82082166addd423b734661a158ec2254e diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 12b9ba590db9..9db39d8c47da 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -8,6 +8,7 @@ def main(argv): inputdir = '' outputdir = '' extracted_files_dir_by_user = '' +extracted_files_dir = '' #read the arguments try: @@ -34,23 +35,27 @@ def main(argv): # use default directory path for extracted ooxml files. extracted_files_dir = os.path.join(outputdir, 'extractedfiles') extract_files(inputdir, extracted_files_dir) - -# create seperate result files for each ooxml document as .result in output directory -for ext_dir in get_list_of_subdir(extracted_files_dir): -i = ext_dir.rfind('/') -sub_result_name = ext_dir[i+1:] + ".result" -sub_result_list = [] -count_elements(ext_dir, sub_result_list) -sub_result_path = os.path.join(outputdir, sub_result_name) - -# sort the result sub list according to tag names -sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) - -with open(sub_result_path, "w") as log_file: -pprint.pprint(sub_result_list, log_file) else: # use user defined directory path for extracted ooxml files. 
-count_elements(extracted_files_dir_by_user, result_list) +extracted_files_dir = extracted_files_dir_by_user + +# create seperate result files for each ooxml document as .result in output directory +for ext_dir in get_list_of_subdir(extracted_files_dir): +i = ext_dir.rfind('/') +sub_result_name = ext_dir[i+1:] + ".result" +sub_result_list = [] +count_elements(ext_dir, sub_result_list) +sub_result_path = os.path.join(outputdir, sub_result_name) + +# sort the result sub list according to tag names +sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) + +if os.path.exists(sub_result_path): +os.remove(sub_result_path) +for i in sub_result_list: +with open(sub_result_path, "a") as log_file: +print(i, file=log_file) +log_file.close() # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -154,7 +159,8 @@ def count_elements(extracted_files_dir, result_list): else: result_list[tag_idx][2][attr_value] +=1 -if not (str(child.text) == "None"): +# count text contents except consisted of whitespaces. +if not (str(child.text) == "None" or str(child.text).strip()==""): if not child.text in result_list[tag_idx][3].keys(): result_list[tag_idx][3][child.text] = 1 else: ___ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 90 --- 1 file changed, 57 insertions(+), 33 deletions(-) New commits: commit fc03e6b942a9170bda5964f95893c18123b340e4 Author: Gülşah Köse AuthorDate: Wed May 26 18:25:11 2021 +0300 Commit: Gülşah Köse CommitDate: Wed May 26 18:25:11 2021 +0300 Export the accepted files part as function. And sort the sub result list Change-Id: I9b5c003b6363ac50cf7c838cc4e954c14ef935de diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 8dbfe8cacd0b..12b9ba590db9 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -42,6 +42,10 @@ def main(argv): sub_result_list = [] count_elements(ext_dir, sub_result_list) sub_result_path = os.path.join(outputdir, sub_result_name) + +# sort the result sub list according to tag names +sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) + with open(sub_result_path, "w") as log_file: pprint.pprint(sub_result_list, log_file) else: @@ -89,6 +93,23 @@ def replace_namespace_with_alias(filename, element): element = element.replace("{" + element_ns + "}", "") return element +def is_file_in_accepted_files(filename): +if(filename.endswith("[Content_Types].xml") or \ + filename.endswith("docProps/custom.xml") or \ + filename.endswith("docProps/app.xml") or\ + filename.endswith("presentation.xml") or \ + filename.endswith("viewProps.xml") or \ + filename.endswith("tableStyles.xml") or \ + filename.endswith("presProps.xml") or \ + "ppt/slideLayouts" in filename or \ + "ppt/slideMasters" in filename or \ + "ppt/theme" in filename or \ + filename.endswith("docProps/core.xml") or not \ + filename.endswith(".xml")): + return False + +return True + # counts tags, attribute names and values of xmls def count_elements(extracted_files_dir, result_list): @@ -101,40 +122,43 @@ def count_elements(extracted_files_dir, result_list): # parse xmls and count elements for xmlfile in list_of_files: -if(xmlfile.endswith(".xml")): -tree = ET.parse(xmlfile) -root = tree.getroot() - -# start to count 
-for child in root.iter(): -tag = replace_namespace_with_alias(xmlfile, child.tag) -tag_idx = get_index_of_tag(tag, result_list) - -# count tags -if (tag_idx == -1): -tmp_list = [{tag: 1},{},{},{}] -result_list.append(tmp_list) +if not is_file_in_accepted_files(xmlfile): +continue + +print(xmlfile) +tree = ET.parse(xmlfile) +root = tree.getroot() + +# start to count +for child in root.iter(): +tag = replace_namespace_with_alias(xmlfile, child.tag) +tag_idx = get_index_of_tag(tag, result_list) + +# count tags +if (tag_idx == -1): +tmp_list = [{tag: 1},{},{},{}] +result_list.append(tmp_list) +else: +result_list[tag_idx][0][tag] += 1 + +# count attribute names and values of current tag +for attr_name, attr_value in child.attrib.items(): +attr_name = replace_namespace_with_alias(xmlfile, attr_name) +if not attr_name in result_list[tag_idx][1].keys(): +result_list[tag_idx][1][attr_name] = 1 +else: +result_list[tag_idx][1][attr_name] +=1 + +if not attr_value in result_list[tag_idx][2].keys(): +result_list[tag_idx][2][attr_value] = 1 +else: +result_list[tag_idx][2][attr_value] +=1 + +if not (str(child.text) == "None"): +if not child.text in result_list[tag_idx][3].keys(): +result_list[tag_idx][3][child.text] = 1 else: -result_list[tag_idx][0][tag] += 1 - -# count attribute names and values of current tag -for attr_name, attr_value in child.attrib.items(): -attr_name = replace_namespace_with_alias(xmlfile, attr_name) -if not attr_name in result_list[tag_idx][1].keys(): -result_list[tag_idx][1][attr_name] = 1 -else: -result_list[tag_idx][1][attr_name] +=1 - -if not attr_value in result_list[tag_idx][2].keys(): -result_list[tag_idx][2][attr_value] = 1 -else: -result_list[tag_idx][2][attr_value] +=1 - -if not (str(child.text) == "None"): -if not child.text in result_list[tag_idx][3].keys(): -
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 31 ++- 1 file changed, 22 insertions(+), 9 deletions(-) New commits: commit a8b521dc0f8e810f97630551406ccd8d1590371f Author: Gülşah Köse AuthorDate: Wed May 26 16:57:23 2021 +0300 Commit: Gülşah Köse CommitDate: Wed May 26 16:57:23 2021 +0300 Create a sub result for each ooxml document in output directory Change-Id: Ibbb366725d344f8e44c085ced60c35e190f98a9d diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 3f9b0e8bdad1..8dbfe8cacd0b 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -35,13 +35,19 @@ def main(argv): extracted_files_dir = os.path.join(outputdir, 'extractedfiles') extract_files(inputdir, extracted_files_dir) -count_elements(extracted_files_dir, result_list) +# create seperate result files for each ooxml document as .result in output directory +for ext_dir in get_list_of_subdir(extracted_files_dir): +i = ext_dir.rfind('/') +sub_result_name = ext_dir[i+1:] + ".result" +sub_result_list = [] +count_elements(ext_dir, sub_result_list) +sub_result_path = os.path.join(outputdir, sub_result_name) +with open(sub_result_path, "w") as log_file: +pprint.pprint(sub_result_list, log_file) else: # use user defined directory path for extracted ooxml files. count_elements(extracted_files_dir_by_user, result_list) -pprint.pprint(result_list) - # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -49,9 +55,6 @@ def extract_files(inputdir, extracted_files_dir): if(os.path.exists(extracted_files_dir)): shutil.rmtree(extracted_files_dir) -# holds directory names for each ooxml document in extracted files dir. 
-counter = 1 - # unzip files into the extracted files directory for filename in os.listdir(inputdir): if (filename.endswith(".pptx") or \ @@ -59,12 +62,10 @@ def extract_files(inputdir, extracted_files_dir): filename.endswith(".xlsx")) and not \ filename.startswith("~"): filepath = os.path.join(inputdir, filename) -extracted_file_path = os.path.join(extracted_files_dir, str(counter)) +extracted_file_path = os.path.join(extracted_files_dir, filename) with ZipFile(filepath) as zipObj: zipObj.extractall(extracted_file_path) - -counter +=1 else: continue @@ -158,5 +159,17 @@ def get_list_of_files(directory_name): return all_files +def get_list_of_subdir(directory_name): + +list_of_file = os.listdir(directory_name) +subdirs = list() + +for filename in list_of_file: +full_path = os.path.join(directory_name, filename) +if os.path.isdir(full_path): +subdirs.append(full_path) + +return subdirs + if __name__ == "__main__": main(sys.argv[1:]) ___ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py
bin/ooxml-analyze.py | 28 1 file changed, 24 insertions(+), 4 deletions(-) New commits: commit dbb7762b1235ae245fd5b67046737edf5519fbd9 Author: Gülşah Köse AuthorDate: Wed May 26 16:47:12 2021 +0300 Commit: Gülşah Köse CommitDate: Wed May 26 16:47:12 2021 +0300 Replace namespaces with namespace aliases on result Change-Id: If29c0b5d9eb52a7d42a1d1482010653d2714c8fe diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index efc44bbfa32c..3f9b0e8bdad1 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -33,8 +33,8 @@ def main(argv): if(extracted_files_dir_by_user == ''): # use default directory path for extracted ooxml files. extracted_files_dir = os.path.join(outputdir, 'extractedfiles') - extract_files(inputdir, extracted_files_dir) + count_elements(extracted_files_dir, result_list) else: # use user defined directory path for extracted ooxml files. @@ -58,17 +58,36 @@ def extract_files(inputdir, extracted_files_dir): filename.endswith(".docx") or \ filename.endswith(".xlsx")) and not \ filename.startswith("~"): - filepath = os.path.join(inputdir, filename) extracted_file_path = os.path.join(extracted_files_dir, str(counter)) with ZipFile(filepath) as zipObj: zipObj.extractall(extracted_file_path) -counter += 1 +counter +=1 else: continue +# get key of value in dictionary +def get_key(val, dict): +for key, value in dict.items(): + if val == value: + return str(key) +return '' + +# replace curlybrace namespaces with the shorten ones +def replace_namespace_with_alias(filename, element): +namespaces = dict([node for _, node in ET.iterparse(filename, events=['start-ns'])]) +i = element.find('}') +if i>=0: +element_ns = element[1:i] +element_ns_alias = get_key(element_ns, namespaces) +if element_ns_alias !='': +element = element.replace("{" + element_ns + "}", element_ns_alias + ":") +else: +element = element.replace("{" + element_ns + "}", "") +return element + # counts tags, attribute names and values of xmls def count_elements(extracted_files_dir, 
result_list): @@ -87,7 +106,7 @@ def count_elements(extracted_files_dir, result_list): # start to count for child in root.iter(): -tag = str(child.tag) +tag = replace_namespace_with_alias(xmlfile, child.tag) tag_idx = get_index_of_tag(tag, result_list) # count tags @@ -99,6 +118,7 @@ def count_elements(extracted_files_dir, result_list): # count attribute names and values of current tag for attr_name, attr_value in child.attrib.items(): +attr_name = replace_namespace_with_alias(xmlfile, attr_name) if not attr_name in result_list[tag_idx][1].keys(): result_list[tag_idx][1][attr_name] = 1 else: ___ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits