qa/document_analyser.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+)
New commits: commit 71ffc7eba9137e94a96b72fed762cc1c9a82baeb Author: Ahlaam Rafiq <[email protected]> AuthorDate: Sun Apr 4 13:17:48 2021 +0300 Commit: Ilmari Lauhakangas <[email protected]> CommitDate: Fri Sep 17 13:25:19 2021 +0200 tdf#124141 add document analyser Co-authored-by: Sebastian O. <[email protected]> Change-Id: Ie1e3474d020721538d6618addf7094b3307d9f5c Reviewed-on: https://gerrit.libreoffice.org/c/dev-tools/+/113567 Reviewed-by: Xisco Fauli <[email protected]> Reviewed-by: Ilmari Lauhakangas <[email protected]> Tested-by: Ilmari Lauhakangas <[email protected]> diff --git a/qa/document_analyser.py b/qa/document_analyser.py new file mode 100644 index 0000000..06bc98a --- /dev/null +++ b/qa/document_analyser.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# + +""" +Document analyser uses the odfpy module: https://pypi.org/project/odfpy/ + +This script prints: +bookmark count, cell count, changetracking count, character count, +comment count, draw count, frame count, hyperlink count, +image count, non-whitespace character count, object count, OLE object count, +page count, paragraph count, row count, sentence count, +syllable count, table count, textbox count, word count, and paragraph styles. 
import odf
from odf.namespaces import TEXTNS
from odf.element import Element
from odf.opendocument import load
from odf import text, meta, office, draw


print("Enter filename: ")
filename = input()

doc = load(filename)

#--------------------document statistics from the odf.meta module--------------------
# (label, attribute name) pairs, printed in this order for every
# meta.DocumentStatistic element found in the document.
_STATISTIC_LABELS = (
    ("Cell count:", 'cellcount'),
    ("Character count:", 'charactercount'),
    ("Draw count:", 'drawcount'),
    ("Frame count:", 'framecount'),
    ("Image count:", 'imagecount'),
    ("Non-whitespace character count:", 'nonwhitespacecharactercount'),
    ("Object count:", 'objectcount'),
    ("Object linking and embedding (OLE) object count:", 'oleobjectcount'),
    ("Page count:", 'pagecount'),
    ("Paragraph count:", 'paragraphcount'),
    ("Row count:", 'rowcount'),
    ("Sentence count:", 'sentencecount'),
    ("Syllable count:", 'syllablecount'),
    ("Table count:", 'tablecount'),
    ("Word count:", 'wordcount'),
)

print("\nDOCUMENT STATISTICS\n")
# Nothing below the heading is printed when the document holds no
# meta.DocumentStatistic element.
for stat in doc.getElementsByType(meta.DocumentStatistic):
    for label, attribute in _STATISTIC_LABELS:
        print(label, stat.getAttribute(attribute))


#--------------------type counter for attributes not covered by odf.meta.DocumentStatistic--------------------
def type_counter(doc, type):
    """Return how many elements of the given type the document contains.

    doc  -- object providing getElementsByType(), here the loaded document.
    type -- element factory handed to getElementsByType(), e.g. text.Bookmark.
            The name shadows the builtin; it is kept so that existing
            keyword callers keep working.
    """
    return sum(1 for _ in doc.getElementsByType(type))


# Printed label -> odfpy element factory counted under that label.
types = {
    'Bookmark': text.Bookmark,
    'Changetracking': text.FormatChange,
    'Comment': office.Annotation,
    'Hyperlink': text.A,
    'Textbox': draw.TextBox,
}

for key, value in types.items():
    print(key, 'count:', type_counter(doc, value))


#--------------------paragraph styles--------------------
def paragraph_style(doc):
    """Print the 'stylename' attribute of every text.P element in doc.

    Paragraphs are numbered from 1 in the order getElementsByType()
    yields them. Nothing is returned.
    """
    for number, paragraph in enumerate(doc.getElementsByType(text.P), start=1):
        print('Paragraph', number, 'style:', paragraph.getAttribute('stylename'))


paragraph_style(doc)
