XZise has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/213192

Change subject: [FEAT] pagefromfile: Simplified file structure
......................................................................

[FEAT] pagefromfile: Simplified file structure

Instead of requiring separate start and end markers, it now also supports a
single combined marker (so that there can't be text between the two markers).
It also supports deriving the start marker automatically from the first line
of the file.

Change-Id: Iff3e11c6237506805003409e13baf6f28f1fe8e8
---
M scripts/pagefromfile.py
1 file changed, 50 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/92/213192/1

diff --git a/scripts/pagefromfile.py b/scripts/pagefromfile.py
index 209355a..6e867d7 100755
--- a/scripts/pagefromfile.py
+++ b/scripts/pagefromfile.py
@@ -19,6 +19,9 @@
 
 -start:xxx      Specify the text that marks the beginning of a page
 -end:xxx        Specify the text that marks the end of a page
+-autostart      The start marker is the first line of the file (replaces -start)
+-startisstop    The start marker is also the endmarker for the previous page
+                (replaces -end)
 -file:xxx       Give the filename we are getting our material from
                 (default: dict.txt)
 -include        The beginning and end markers should be included
@@ -64,6 +67,7 @@
 
 import pywikibot
 from pywikibot import config, Bot, i18n
+from pywikibot.tools import deprecated
 
 
 class NoTitle(Exception):
@@ -190,6 +194,9 @@
         self.titleEndMarker = titleEndMarker
         self.include = include
         self.notitle = notitle
+        self._title_regex = re.compile('{0}(.*?){1}'.format(
+            re.escape(self.titleStartMarker),
+            re.escape(self.titleEndMarker)), re.DOTALL)
 
     def run(self):
         """Read file and yield page title and content."""
@@ -203,46 +210,65 @@
             pywikibot.output(str(err))
             raise IOError
 
+        if self.pageStartMarker is None:
+            self.pageStartMarker = text[:text.index('\n')]
+
+        page_regex = r'{0}(.*?)'
+        if self.pageEndMarker is None:
+            page_regex += '(?={0})'
+        else:
+            page_regex += '{1}'
+        page_regex = re.compile(page_regex.format(
+            re.escape(self.pageStartMarker),
+            re.escape(self.pageEndMarker or '')), re.DOTALL)
+
         position = 0
-        length = 0
-        while True:
+        for page_match in page_regex.finditer(text):
+            text_between = text[position:page_match.start()].strip()
+            if text_between:
+                pywikibot.warning('Found text between page markers: {0}'.format(
+                    text_between))
+            position = page_match.end()
             try:
-                length, title, contents = self.findpage(text[position:])
-            except AttributeError:
-                if not length:
-                    pywikibot.output(u'\nStart or end marker not found.')
-                else:
-                    pywikibot.output(u'End of file.')
-                break
-            except NoTitle as err:
-                pywikibot.output(u'\nNo title found - skipping a page.')
-                position += err.offset
-                continue
+                yield self._extract_information(page_match)
+            except NoTitle:
+                pywikibot.warning(
+                    'No title found for page in line {0}. Skipping.'.format(
+                        text.count('\n', 0, page_match.start()) + 1))
 
-            position += length
-            yield title, contents
+        if self.pageEndMarker is None:
+            # start marker is also end marker, so it wasn't matched
+            position += len(self.pageStartMarker)
+        rest_of_file = text[position:].strip()
+        if rest_of_file:
+            pywikibot.warning('Found text after the last page marker: {0}'.format(
+                rest_of_file))
 
+    @deprecated
     def findpage(self, text):
         """Find page to work on."""
         pageR = re.compile(re.escape(self.pageStartMarker) + "(.*?)" +
                            re.escape(self.pageEndMarker), re.DOTALL)
-        titleR = re.compile(re.escape(self.titleStartMarker) + "(.*?)" +
-                            re.escape(self.titleEndMarker))
 
         location = pageR.search(text)
+        title, contents = self._extract_location(location)
+        return location.end(), title, contents
+
+    def _extract_information(self, location):
+        """Return title and contents from the given match."""
         if self.include:
             contents = location.group()
         else:
             contents = location.group(1)
         try:
-            title = titleR.search(contents).group(1)
+            title = self._title_regex.search(contents).group(1)
             if self.notitle:
                 # Remove title (to allow creation of redirects)
-                contents = titleR.sub('', contents, count=1)
+                contents = self._title_regex.sub('', contents, count=1)
         except AttributeError:
             raise NoTitle(location.end())
         else:
-            return location.end(), title, contents
+            return title, contents
 
 
 def main(*args):
@@ -273,6 +299,10 @@
             pageStartMarker = arg[7:]
         elif arg.startswith("-end:"):
             pageEndMarker = arg[5:]
+        elif arg == '-autostart':
+            pageStartMarker = None
+        elif arg == '-startisstop':
+            pageEndMarker = None
         elif arg.startswith("-file:"):
             filename = arg[6:]
         elif arg == "-include":

-- 
To view, visit https://gerrit.wikimedia.org/r/213192
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iff3e11c6237506805003409e13baf6f28f1fe8e8
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to