[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-10-19 Thread gulsahkose (via logerrit)
 bin/ooxml-analyze.py |  157 +--
 1 file changed, 141 insertions(+), 16 deletions(-)

New commits:
commit 001ab94b0af51d9908ce078b4f422a87ad79d971
Author: gulsahkose 
AuthorDate: Tue Oct 19 17:10:03 2021 +0300
Commit: gulsahkose 
CommitDate: Tue Oct 19 17:10:03 2021 +0300

Unparsable commit is sent to fetch local and remote repo.

Change-Id: I115d0097c14e192c0c93a8aabd186fc7e4296f23

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index cc7a7a036980..93150e6897f2 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import sys, getopt, os, shutil
+from typing import Type
 import xml.etree.ElementTree as ET
 from zipfile import ZipFile
 from lxml import etree
@@ -10,24 +11,27 @@ def main(argv):
 outputdir = ''
 extracted_files_dir_by_user = ''
 extracted_files_dir = ''
+fileformat = ''
 
 #read the arguments
 try:
-   opts, args = getopt.getopt(argv,"hi:o:e:",["idir=","odir="])
+   opts, args = getopt.getopt(argv,"hi:o:e:t:",["idir=","odir="])
 except getopt.GetoptError:
-   print ('analyze.py -i  -o ')
+   print ('analyze.py -i  -o  -t ')
sys.exit(2)
 
 for opt, arg in opts:
-   if opt == '-h':
-  print ('analyze.py -i  -o ')
-  sys.exit()
-   elif opt == '-e':
-  extracted_files_dir_by_user = arg
-   elif opt in ("-i", "--idir"):
-  inputdir = arg
-   elif opt in ("-o", "--odir"):
-  outputdir = arg
+if opt == '-h':
+print ('analyze.py -i  -o  -t ')
+sys.exit()
+elif opt == '-e':
+extracted_files_dir_by_user = arg
+elif opt in ("-i", "--idir"):
+inputdir = arg
+elif opt in ("-o", "--odir"):
+outputdir = arg
+elif opt == '-t':
+fileformat = arg
 
 if(extracted_files_dir_by_user == ''):
 # use default directory path for extracted ooxml files.
@@ -45,7 +49,14 @@ def main(argv):
 sub_texts_name = ext_dir[i+1:] + ".text"
 sub_result_list = []
 concatenated_texts_list = [] # holds concanated texts for each 
paragraph
-count_elements(ext_dir, sub_result_list, concatenated_texts_list)
+
+if fileformat == "pptx":
+count_pptx_elements(ext_dir, sub_result_list, 
concatenated_texts_list)
+elif fileformat == "xlsx":
+count_xlsx_elements(ext_dir, sub_result_list)
+else:
+print("File format is not supported")
+break
 
 sub_result_path = os.path.join(outputdir, sub_result_name)
 sub_texts_path = os.path.join(outputdir, sub_texts_name)
@@ -69,8 +80,8 @@ def main(argv):
 log_file.close()
 
 # no need to keep extracted files anymore.
-if(os.path.exists(extracted_files_dir)):
-shutil.rmtree(extracted_files_dir)
+#if(os.path.exists(extracted_files_dir)):
+#shutil.rmtree(extracted_files_dir)
 
 # unzip all ooxml files into the given path
 def extract_files(inputdir, extracted_files_dir):
@@ -80,6 +91,7 @@ def extract_files(inputdir, extracted_files_dir):
 shutil.rmtree(extracted_files_dir)
 
 # unzip files into the extracted files directory
+
 for filetype in get_list_of_subdir(inputdir):
 for filename in os.listdir(filetype):
 if (filename.endswith(".pptx") or   \
@@ -119,13 +131,126 @@ def replace_namespace_with_alias(filename, element):
 
 # decides which files should/shouldn't be analyzed.
 def is_file_in_accepted_files(filename):
-if(filename.endswith(".xml") and "ppt/slides/" in filename):
+if(filename.endswith(".xml") and ("ppt/slides/" in filename or 
"xl/worksheets" in filename)):
return True
 
 return False
 
+def read_shared_strings(shared_strings_list, shared_strings_path):
+tree = ET.parse(shared_strings_path)
+for child in tree.iter():
+if child.tag == 
'{http://schemas.openxmlformats.org/spreadsheetml/2006/main}t':
+shared_strings_list.append(child.text)
+
+def get_pivot_table_range(sheet_relation_path):
+tree = ET.parse(sheet_relation_path)
+for elem in tree.iter():
+if elem.tag == 
"{http://schemas.openxmlformats.org/package/2006/relationships}Relationship; 
and\
+   elem.attrib['Type'] == 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable":
+i = sheet_relation_path.rfind('/')
+pivot_table_path = os.path.join(sheet_relation_path[:i], ".." 
,elem.attrib['Target'])
+p_tree = ET.parse(pivot_table_path)
+for p_elem in p_tree.iter():
+if p_elem.tag == 
"{http://schemas.openxmlformats.org/spreadsheetml/2006/main}location; and \
+   p_elem.attrib['ref']:
+return p_elem.attrib['ref']
+return ''
+
+def is_cell_in_range(cell_id, cell_range):
+i = 

[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-07-20 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |  100 +--
 1 file changed, 42 insertions(+), 58 deletions(-)

New commits:
commit 18e89687fde3b3cfac00ead00cbefbb98262cdfe
Author: Gülşah Köse 
AuthorDate: Tue Jul 20 14:15:42 2021 +0300
Commit: Gülşah Köse 
CommitDate: Tue Jul 20 14:20:55 2021 +0300

remove namespace replacing and some small updates

Change-Id: I2d56668186c8745fca683025710646ae505a0d6b

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 87acd377c854..cc7a7a036980 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -80,21 +80,22 @@ def extract_files(inputdir, extracted_files_dir):
 shutil.rmtree(extracted_files_dir)
 
 # unzip files into the extracted files directory
-for filename in os.listdir(inputdir):
-if (filename.endswith(".pptx") or   \
-filename.endswith(".docx") or   \
-filename.endswith(".xlsx")) and not \
-filename.startswith("~"):
-filepath = os.path.join(inputdir, filename)
-extracted_file_path = os.path.join(extracted_files_dir, filename)
-
-try:
-with ZipFile(filepath) as zipObj:
-zipObj.extractall(extracted_file_path)
-except:
-print("%s is problematic" % filename)
-else:
-continue
+for filetype in get_list_of_subdir(inputdir):
+for filename in os.listdir(filetype):
+if (filename.endswith(".pptx") or   \
+filename.endswith(".docx") or   \
+filename.endswith(".xlsx")) and not \
+filename.startswith("~"):
+filepath = os.path.join(filetype, filename)
+extracted_file_path = os.path.join(extracted_files_dir, 
filename)
+
+try:
+with ZipFile(filepath) as zipObj:
+zipObj.extractall(extracted_file_path)
+except:
+print("%s is problematic" % filename)
+else:
+continue
 
 # get key of value in dictionary
 def get_key(val, dict):
@@ -116,34 +117,17 @@ def replace_namespace_with_alias(filename, element):
 element = element.replace("{" + element_ns + "}", "")
 return element
 
-# decides which files shouldn't be analyzed.
+# decides which files should/shouldn't be analyzed.
 def is_file_in_accepted_files(filename):
-if(filename.endswith("[Content_Types].xml") or \
-   filename.endswith("docProps/custom.xml") or \
-   filename.endswith("docProps/app.xml") or\
-   filename.endswith("presentation.xml") or \
-   filename.endswith("viewProps.xml") or \
-   filename.endswith("tableStyles.xml") or \
-   filename.endswith("presProps.xml") or \
-   "ppt/slideLayouts" in filename or \
-   "ppt/slideMasters" in filename or \
-   "ppt/theme" in filename or \
-   "ppt/notesMasters" in filename or \
-   "ppt/notesSlides" in filename or \
-   "ppt/handoutMasters" in filename or \
-   "ppt/tags" in filename or \
-   "pptx/customXml" in filename or \
-   "ppt/diagrams" in filename or \
-   filename.endswith("docProps/core.xml") or not \
-   filename.endswith(".xml")):
-   return False
-
-return True
+if(filename.endswith(".xml") and "ppt/slides/" in filename):
+   return True
+
+return False
 
 # counts tags, attribute names and values of xmls
 def count_elements(extracted_files_dir, result_list, concanated_texts_list):
 
-# make sure if extracted files directory exist
+# make sure if extracted files directory not exist
 if not (os.path.exists(extracted_files_dir)):
 print("Extracted files directory is not exist")
 return
@@ -160,7 +144,7 @@ def count_elements(extracted_files_dir, result_list, 
concanated_texts_list):
 try:
 # start to count
 for event, child in etree.iterparse(xmlfile, events=('start', 
'end')):
-tag = replace_namespace_with_alias(xmlfile, child.tag)
+tag = child.tag #replace_namespace_with_alias(xmlfile, 
child.tag)
 tag_idx = get_index_of_tag(tag, result_list)
 
 if event == "start":
@@ -171,30 +155,29 @@ def count_elements(extracted_files_dir, result_list, 
concanated_texts_list):
 else:
 result_list[tag_idx][0][tag] += 1
 
-# count attribute names and values of current tag
-for attr_name, attr_value in child.attrib.items():
-attr_name = replace_namespace_with_alias(xmlfile, 
attr_name)
-if not attr_name in result_list[tag_idx][1].keys():
-result_list[tag_idx][1][attr_name] = 1
-else:
-result_list[tag_idx][1][attr_name] +=1
-
-if not 

[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-06-07 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |  103 ---
 1 file changed, 57 insertions(+), 46 deletions(-)

New commits:
commit 7c03a4c092d9cba10ecb22e7f97aaca851259f1f
Author: Gülşah Köse 
AuthorDate: Tue Jun 8 08:43:30 2021 +0300
Commit: Gülşah Köse 
CommitDate: Tue Jun 8 08:43:30 2021 +0300

Improve error handling, exclude None texts

Change-Id: Idedad9c414311d95c355ea70a913f8e0ddf7

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index a7e2bc2a549f..87acd377c854 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -88,8 +88,11 @@ def extract_files(inputdir, extracted_files_dir):
 filepath = os.path.join(inputdir, filename)
 extracted_file_path = os.path.join(extracted_files_dir, filename)
 
-with ZipFile(filepath) as zipObj:
-zipObj.extractall(extracted_file_path)
+try:
+with ZipFile(filepath) as zipObj:
+zipObj.extractall(extracted_file_path)
+except:
+print("%s is problematic" % filename)
 else:
 continue
 
@@ -126,6 +129,11 @@ def is_file_in_accepted_files(filename):
"ppt/slideMasters" in filename or \
"ppt/theme" in filename or \
"ppt/notesMasters" in filename or \
+   "ppt/notesSlides" in filename or \
+   "ppt/handoutMasters" in filename or \
+   "ppt/tags" in filename or \
+   "pptx/customXml" in filename or \
+   "ppt/diagrams" in filename or \
filename.endswith("docProps/core.xml") or not \
filename.endswith(".xml")):
return False
@@ -149,51 +157,54 @@ def count_elements(extracted_files_dir, result_list, 
concanated_texts_list):
 
 print(xmlfile)
 
-# start to count
-for event, child in etree.iterparse(xmlfile, events=('start', 'end')):
-tag = replace_namespace_with_alias(xmlfile, child.tag)
-tag_idx = get_index_of_tag(tag, result_list)
-
-if event == "start":
-# count tags
-if (tag_idx == -1):
-tmp_list = [{tag: 1},{},{},{}]
-result_list.append(tmp_list)
-else:
-result_list[tag_idx][0][tag] += 1
-
-# count attribute names and values of current tag
-for attr_name, attr_value in child.attrib.items():
-attr_name = replace_namespace_with_alias(xmlfile, 
attr_name)
-if not attr_name in result_list[tag_idx][1].keys():
-result_list[tag_idx][1][attr_name] = 1
+try:
+# start to count
+for event, child in etree.iterparse(xmlfile, events=('start', 
'end')):
+tag = replace_namespace_with_alias(xmlfile, child.tag)
+tag_idx = get_index_of_tag(tag, result_list)
+
+if event == "start":
+# count tags
+if (tag_idx == -1):
+tmp_list = [{tag: 1},{},{},{}]
+result_list.append(tmp_list)
 else:
-result_list[tag_idx][1][attr_name] +=1
-
-if not attr_value in result_list[tag_idx][2].keys():
-result_list[tag_idx][2][attr_value] = 1
-else:
-result_list[tag_idx][2][attr_value] +=1
-
-# concanated text will be resetted in every paragraph begining
-if tag == "a:p":
-concatenated_text = ""
-
-
-if event == "end":
-# Detect seperate texts in paragraph and concanate them.
-if tag == "a:t":
-concatenated_text += str(child.text)
-# End of the paragraph element, add the text as list item.
-if tag == "a:p" and concatenated_text != "":
-concanated_texts_list.append(concatenated_text)
-
-# count text contents except consisted of whitespaces.
-if not (str(child.text) == "None" or 
str(child.text).strip()==""):
-if not child.text in result_list[tag_idx][3].keys():
-result_list[tag_idx][3][child.text] = 1
-else:
-result_list[tag_idx][3][child.text] += 1
+result_list[tag_idx][0][tag] += 1
+
+# count attribute names and values of current tag
+for attr_name, attr_value in child.attrib.items():
+attr_name = replace_namespace_with_alias(xmlfile, 
attr_name)
+if not attr_name in result_list[tag_idx][1].keys():
+result_list[tag_idx][1][attr_name] = 1
+else:
+result_list[tag_idx][1][attr_name] +=1
+
+if not 

[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-06-03 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |   99 ---
 1 file changed, 64 insertions(+), 35 deletions(-)

New commits:
commit f2bde987693fad6e1347f99e34c2ad5291ea8ee6
Author: Gülşah Köse 
AuthorDate: Thu Jun 3 14:11:05 2021 +0300
Commit: Gülşah Köse 
CommitDate: Thu Jun 3 14:15:14 2021 +0300

Concatenate separate text runs and create a separate result file for them.

For example:
<a:p>
  <a:r>
    <a:t>text1</a:t>
  </a:r>
  <a:r>
    <a:t>text2</a:t>
  </a:r>
</a:p>

We will keep the result text as "text1text2"

As a result, we will create a .text file to hold such concatenated texts.

Change-Id: I946af39e2037db1f986e73039d0a462a36bba1d8

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 9db39d8c47da..a7e2bc2a549f 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -1,8 +1,9 @@
 #!/usr/bin/python
 
-import sys, getopt, os, shutil, pprint
+import sys, getopt, os, shutil
 import xml.etree.ElementTree as ET
 from zipfile import ZipFile
+from lxml import etree
 
 def main(argv):
 inputdir = ''
@@ -28,9 +29,6 @@ def main(argv):
elif opt in ("-o", "--odir"):
   outputdir = arg
 
-# holds the result structer of analyze
-result_list = []
-
 if(extracted_files_dir_by_user == ''):
 # use default directory path for extracted ooxml files.
 extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
@@ -40,22 +38,39 @@ def main(argv):
 extracted_files_dir = extracted_files_dir_by_user
 
 # create seperate result files for each ooxml document as .result in output directory
+# create seperate concanated texts for each ooxml document as .text in output directory
 for ext_dir in get_list_of_subdir(extracted_files_dir):
 i = ext_dir.rfind('/')
 sub_result_name = ext_dir[i+1:] + ".result"
+sub_texts_name = ext_dir[i+1:] + ".text"
 sub_result_list = []
-count_elements(ext_dir, sub_result_list)
+concatenated_texts_list = [] # holds concanated texts for each 
paragraph
+count_elements(ext_dir, sub_result_list, concatenated_texts_list)
+
 sub_result_path = os.path.join(outputdir, sub_result_name)
+sub_texts_path = os.path.join(outputdir, sub_texts_name)
 
 # sort the result sub list according to tag names
 sub_result_list = sorted(sub_result_list, key=lambda x: 
list(x[0].keys())[0], reverse=False)
+concatenated_texts_list.sort()
 
 if os.path.exists(sub_result_path):
 os.remove(sub_result_path)
+if os.path.exists(sub_texts_path):
+os.remove(sub_texts_path)
+
 for i in sub_result_list:
 with open(sub_result_path, "a") as log_file:
 print(i, file=log_file)
-log_file.close()
+log_file.close()
+for i in concatenated_texts_list:
+with open(sub_texts_path, "a") as log_file:
+print(i, file=log_file)
+log_file.close()
+
+# no need to keep extracted files anymore.
+if(os.path.exists(extracted_files_dir)):
+shutil.rmtree(extracted_files_dir)
 
 # unzip all ooxml files into the given path
 def extract_files(inputdir, extracted_files_dir):
@@ -98,6 +113,7 @@ def replace_namespace_with_alias(filename, element):
 element = element.replace("{" + element_ns + "}", "")
 return element
 
+# decides which files shouldn't be analyzed.
 def is_file_in_accepted_files(filename):
 if(filename.endswith("[Content_Types].xml") or \
filename.endswith("docProps/custom.xml") or \
@@ -109,6 +125,7 @@ def is_file_in_accepted_files(filename):
"ppt/slideLayouts" in filename or \
"ppt/slideMasters" in filename or \
"ppt/theme" in filename or \
+   "ppt/notesMasters" in filename or \
filename.endswith("docProps/core.xml") or not \
filename.endswith(".xml")):
return False
@@ -116,7 +133,7 @@ def is_file_in_accepted_files(filename):
 return True
 
 # counts tags, attribute names and values of xmls
-def count_elements(extracted_files_dir, result_list):
+def count_elements(extracted_files_dir, result_list, concanated_texts_list):
 
 # make sure if extracted files directory exist
 if not (os.path.exists(extracted_files_dir)):
@@ -131,40 +148,52 @@ def count_elements(extracted_files_dir, result_list):
 continue
 
 print(xmlfile)
-tree = ET.parse(xmlfile)
-root = tree.getroot()
 
 # start to count
-for child in root.iter():
+for event, child in etree.iterparse(xmlfile, events=('start', 'end')):
 tag = replace_namespace_with_alias(xmlfile, child.tag)
 tag_idx = get_index_of_tag(tag, result_list)
 
-# count tags
-if (tag_idx == -1):
-tmp_list = [{tag: 1},{},{},{}]
-result_list.append(tmp_list)
-else:
-result_list[tag_idx][0][tag] += 1
-
-

[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-05-31 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |   38 ++
 1 file changed, 22 insertions(+), 16 deletions(-)

New commits:
commit 056ebfae35f6725b9089439a7bf868dad48fdd0f
Author: Gülşah Köse 
AuthorDate: Mon May 31 16:08:23 2021 +0300
Commit: Gülşah Köse 
CommitDate: Mon May 31 16:08:28 2021 +0300

Fix the use of an existing extracted files path and change the result output

The tool was counting text content that consists only of whitespace.
Prevent this: do not count such texts as text content, e.g. " "

Change-Id: Ib71123b82082166addd423b734661a158ec2254e

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 12b9ba590db9..9db39d8c47da 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -8,6 +8,7 @@ def main(argv):
 inputdir = ''
 outputdir = ''
 extracted_files_dir_by_user = ''
+extracted_files_dir = ''
 
 #read the arguments
 try:
@@ -34,23 +35,27 @@ def main(argv):
 # use default directory path for extracted ooxml files.
 extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
 extract_files(inputdir, extracted_files_dir)
-
-# create seperate result files for each ooxml document as .result in output directory
-for ext_dir in get_list_of_subdir(extracted_files_dir):
-i = ext_dir.rfind('/')
-sub_result_name = ext_dir[i+1:] + ".result"
-sub_result_list = []
-count_elements(ext_dir, sub_result_list)
-sub_result_path = os.path.join(outputdir, sub_result_name)
-
-# sort the result sub list according to tag names
-sub_result_list = sorted(sub_result_list, key=lambda x: 
list(x[0].keys())[0], reverse=False)
-
-with open(sub_result_path, "w") as log_file:
-pprint.pprint(sub_result_list, log_file)
 else:
 # use user defined directory path for extracted ooxml files.
-count_elements(extracted_files_dir_by_user, result_list)
+extracted_files_dir = extracted_files_dir_by_user
+
+# create seperate result files for each ooxml document as .result in output directory
+for ext_dir in get_list_of_subdir(extracted_files_dir):
+i = ext_dir.rfind('/')
+sub_result_name = ext_dir[i+1:] + ".result"
+sub_result_list = []
+count_elements(ext_dir, sub_result_list)
+sub_result_path = os.path.join(outputdir, sub_result_name)
+
+# sort the result sub list according to tag names
+sub_result_list = sorted(sub_result_list, key=lambda x: 
list(x[0].keys())[0], reverse=False)
+
+if os.path.exists(sub_result_path):
+os.remove(sub_result_path)
+for i in sub_result_list:
+with open(sub_result_path, "a") as log_file:
+print(i, file=log_file)
+log_file.close()
 
 # unzip all ooxml files into the given path
 def extract_files(inputdir, extracted_files_dir):
@@ -154,7 +159,8 @@ def count_elements(extracted_files_dir, result_list):
 else:
 result_list[tag_idx][2][attr_value] +=1
 
-if not (str(child.text) == "None"):
+# count text contents except consisted of whitespaces.
+if not (str(child.text) == "None" or str(child.text).strip()==""):
 if not child.text in result_list[tag_idx][3].keys():
 result_list[tag_idx][3][child.text] = 1
 else:
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits


[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-05-26 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |   90 ---
 1 file changed, 57 insertions(+), 33 deletions(-)

New commits:
commit fc03e6b942a9170bda5964f95893c18123b340e4
Author: Gülşah Köse 
AuthorDate: Wed May 26 18:25:11 2021 +0300
Commit: Gülşah Köse 
CommitDate: Wed May 26 18:25:11 2021 +0300

Extract the accepted-files check into a function, and sort the sub result list

Change-Id: I9b5c003b6363ac50cf7c838cc4e954c14ef935de

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 8dbfe8cacd0b..12b9ba590db9 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -42,6 +42,10 @@ def main(argv):
 sub_result_list = []
 count_elements(ext_dir, sub_result_list)
 sub_result_path = os.path.join(outputdir, sub_result_name)
+
+# sort the result sub list according to tag names
+sub_result_list = sorted(sub_result_list, key=lambda x: 
list(x[0].keys())[0], reverse=False)
+
 with open(sub_result_path, "w") as log_file:
 pprint.pprint(sub_result_list, log_file)
 else:
@@ -89,6 +93,23 @@ def replace_namespace_with_alias(filename, element):
 element = element.replace("{" + element_ns + "}", "")
 return element
 
+def is_file_in_accepted_files(filename):
+if(filename.endswith("[Content_Types].xml") or \
+   filename.endswith("docProps/custom.xml") or \
+   filename.endswith("docProps/app.xml") or\
+   filename.endswith("presentation.xml") or \
+   filename.endswith("viewProps.xml") or \
+   filename.endswith("tableStyles.xml") or \
+   filename.endswith("presProps.xml") or \
+   "ppt/slideLayouts" in filename or \
+   "ppt/slideMasters" in filename or \
+   "ppt/theme" in filename or \
+   filename.endswith("docProps/core.xml") or not \
+   filename.endswith(".xml")):
+   return False
+
+return True
+
 # counts tags, attribute names and values of xmls
 def count_elements(extracted_files_dir, result_list):
 
@@ -101,40 +122,43 @@ def count_elements(extracted_files_dir, result_list):
 
 # parse xmls and count elements
 for xmlfile in list_of_files:
-if(xmlfile.endswith(".xml")):
-tree = ET.parse(xmlfile)
-root = tree.getroot()
-
-# start to count
-for child in root.iter():
-tag = replace_namespace_with_alias(xmlfile, child.tag)
-tag_idx = get_index_of_tag(tag, result_list)
-
-# count tags
-if (tag_idx == -1):
-tmp_list = [{tag: 1},{},{},{}]
-result_list.append(tmp_list)
+if not is_file_in_accepted_files(xmlfile):
+continue
+
+print(xmlfile)
+tree = ET.parse(xmlfile)
+root = tree.getroot()
+
+# start to count
+for child in root.iter():
+tag = replace_namespace_with_alias(xmlfile, child.tag)
+tag_idx = get_index_of_tag(tag, result_list)
+
+# count tags
+if (tag_idx == -1):
+tmp_list = [{tag: 1},{},{},{}]
+result_list.append(tmp_list)
+else:
+result_list[tag_idx][0][tag] += 1
+
+# count attribute names and values of current tag
+for attr_name, attr_value in child.attrib.items():
+attr_name = replace_namespace_with_alias(xmlfile, attr_name)
+if not attr_name in result_list[tag_idx][1].keys():
+result_list[tag_idx][1][attr_name] = 1
+else:
+result_list[tag_idx][1][attr_name] +=1
+
+if not attr_value in result_list[tag_idx][2].keys():
+result_list[tag_idx][2][attr_value] = 1
+else:
+result_list[tag_idx][2][attr_value] +=1
+
+if not (str(child.text) == "None"):
+if not child.text in result_list[tag_idx][3].keys():
+result_list[tag_idx][3][child.text] = 1
 else:
-result_list[tag_idx][0][tag] += 1
-
-# count attribute names and values of current tag
-for attr_name, attr_value in child.attrib.items():
-attr_name = replace_namespace_with_alias(xmlfile, 
attr_name)
-if not attr_name in result_list[tag_idx][1].keys():
-result_list[tag_idx][1][attr_name] = 1
-else:
-result_list[tag_idx][1][attr_name] +=1
-
-if not attr_value in result_list[tag_idx][2].keys():
-result_list[tag_idx][2][attr_value] = 1
-else:
-result_list[tag_idx][2][attr_value] +=1
-
-if not (str(child.text) == "None"):
-if not child.text in result_list[tag_idx][3].keys():
-

[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-05-26 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |   31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

New commits:
commit a8b521dc0f8e810f97630551406ccd8d1590371f
Author: Gülşah Köse 
AuthorDate: Wed May 26 16:57:23 2021 +0300
Commit: Gülşah Köse 
CommitDate: Wed May 26 16:57:23 2021 +0300

Create a sub result for each ooxml document in output directory

Change-Id: Ibbb366725d344f8e44c085ced60c35e190f98a9d

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 3f9b0e8bdad1..8dbfe8cacd0b 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -35,13 +35,19 @@ def main(argv):
 extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
 extract_files(inputdir, extracted_files_dir)
 
-count_elements(extracted_files_dir, result_list)
+# create seperate result files for each ooxml document as .result in output directory
+for ext_dir in get_list_of_subdir(extracted_files_dir):
+i = ext_dir.rfind('/')
+sub_result_name = ext_dir[i+1:] + ".result"
+sub_result_list = []
+count_elements(ext_dir, sub_result_list)
+sub_result_path = os.path.join(outputdir, sub_result_name)
+with open(sub_result_path, "w") as log_file:
+pprint.pprint(sub_result_list, log_file)
 else:
 # use user defined directory path for extracted ooxml files.
 count_elements(extracted_files_dir_by_user, result_list)
 
-pprint.pprint(result_list)
-
 # unzip all ooxml files into the given path
 def extract_files(inputdir, extracted_files_dir):
 
@@ -49,9 +55,6 @@ def extract_files(inputdir, extracted_files_dir):
 if(os.path.exists(extracted_files_dir)):
 shutil.rmtree(extracted_files_dir)
 
-# holds directory names for each ooxml document in extracted files dir.
-counter = 1
-
 # unzip files into the extracted files directory
 for filename in os.listdir(inputdir):
 if (filename.endswith(".pptx") or   \
@@ -59,12 +62,10 @@ def extract_files(inputdir, extracted_files_dir):
 filename.endswith(".xlsx")) and not \
 filename.startswith("~"):
 filepath = os.path.join(inputdir, filename)
-extracted_file_path = os.path.join(extracted_files_dir, 
str(counter))
+extracted_file_path = os.path.join(extracted_files_dir, filename)
 
 with ZipFile(filepath) as zipObj:
 zipObj.extractall(extracted_file_path)
-
-counter +=1
 else:
 continue
 
@@ -158,5 +159,17 @@ def get_list_of_files(directory_name):
 
 return all_files
 
+def get_list_of_subdir(directory_name):
+
+list_of_file = os.listdir(directory_name)
+subdirs = list()
+
+for filename in list_of_file:
+full_path = os.path.join(directory_name, filename)
+if os.path.isdir(full_path):
+subdirs.append(full_path)
+
+return subdirs
+
 if __name__ == "__main__":
 main(sys.argv[1:])
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits


[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

2021-05-26 Thread Gülşah Köse (via logerrit)
 bin/ooxml-analyze.py |   28 
 1 file changed, 24 insertions(+), 4 deletions(-)

New commits:
commit dbb7762b1235ae245fd5b67046737edf5519fbd9
Author: Gülşah Köse 
AuthorDate: Wed May 26 16:47:12 2021 +0300
Commit: Gülşah Köse 
CommitDate: Wed May 26 16:47:12 2021 +0300

Replace namespaces with namespace aliases on result

Change-Id: If29c0b5d9eb52a7d42a1d1482010653d2714c8fe

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index efc44bbfa32c..3f9b0e8bdad1 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -33,8 +33,8 @@ def main(argv):
 if(extracted_files_dir_by_user == ''):
 # use default directory path for extracted ooxml files.
 extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
-
 extract_files(inputdir, extracted_files_dir)
+
 count_elements(extracted_files_dir, result_list)
 else:
 # use user defined directory path for extracted ooxml files.
@@ -58,17 +58,36 @@ def extract_files(inputdir, extracted_files_dir):
 filename.endswith(".docx") or   \
 filename.endswith(".xlsx")) and not \
 filename.startswith("~"):
-
 filepath = os.path.join(inputdir, filename)
 extracted_file_path = os.path.join(extracted_files_dir, 
str(counter))
 
 with ZipFile(filepath) as zipObj:
 zipObj.extractall(extracted_file_path)
 
-counter += 1
+counter +=1
 else:
 continue
 
+# get key of value in dictionary
+def get_key(val, dict):
+for key, value in dict.items():
+ if val == value:
+ return str(key)
+return ''
+
+# replace curlybrace namespaces with the shorten ones
+def replace_namespace_with_alias(filename, element):
+namespaces = dict([node for _, node in ET.iterparse(filename, 
events=['start-ns'])])
+i = element.find('}')
+if i>=0:
+element_ns = element[1:i]
+element_ns_alias = get_key(element_ns, namespaces)
+if element_ns_alias !='':
+element = element.replace("{" + element_ns + "}", element_ns_alias 
+ ":")
+else:
+element = element.replace("{" + element_ns + "}", "")
+return element
+
 # counts tags, attribute names and values of xmls
 def count_elements(extracted_files_dir, result_list):
 
@@ -87,7 +106,7 @@ def count_elements(extracted_files_dir, result_list):
 
 # start to count
 for child in root.iter():
-tag = str(child.tag)
+tag = replace_namespace_with_alias(xmlfile, child.tag)
 tag_idx = get_index_of_tag(tag, result_list)
 
 # count tags
@@ -99,6 +118,7 @@ def count_elements(extracted_files_dir, result_list):
 
 # count attribute names and values of current tag
 for attr_name, attr_value in child.attrib.items():
+attr_name = replace_namespace_with_alias(xmlfile, 
attr_name)
 if not attr_name in result_list[tag_idx][1].keys():
 result_list[tag_idx][1][attr_name] = 1
 else:
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits