XZise has uploaded a new change for review.
https://gerrit.wikimedia.org/r/213192
Change subject: [FEAT] pagefromfile: Simplified file structure
......................................................................
[FEAT] pagefromfile: Simplified file structure
Instead of requiring separate start and end markers, it now supports a single
combined marker (so that there can't be text between the two markers). It also
supports defining the start marker automatically from the first line of the file.
Change-Id: Iff3e11c6237506805003409e13baf6f28f1fe8e8
---
M scripts/pagefromfile.py
1 file changed, 50 insertions(+), 20 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/92/213192/1
diff --git a/scripts/pagefromfile.py b/scripts/pagefromfile.py
index 209355a..6e867d7 100755
--- a/scripts/pagefromfile.py
+++ b/scripts/pagefromfile.py
@@ -19,6 +19,9 @@
-start:xxx Specify the text that marks the beginning of a page
-end:xxx Specify the text that marks the end of a page
+-autostart The start marker is the first line of the file (replaces -start)
+-startisstop The start marker is also the endmarker for the previous page
+ (replaces -end)
-file:xxx Give the filename we are getting our material from
(default: dict.txt)
-include The beginning and end markers should be included
@@ -64,6 +67,7 @@
import pywikibot
from pywikibot import config, Bot, i18n
+from pywikibot.tools import deprecated
class NoTitle(Exception):
@@ -190,6 +194,9 @@
self.titleEndMarker = titleEndMarker
self.include = include
self.notitle = notitle
+ self._title_regex = re.compile('{0}(.*?){1}'.format(
+ re.escape(self.titleStartMarker),
+ re.escape(self.titleEndMarker)), re.DOTALL)
def run(self):
"""Read file and yield page title and content."""
@@ -203,46 +210,65 @@
pywikibot.output(str(err))
raise IOError
+ if self.pageStartMarker is None:
+ self.pageStartMarker = text[:text.index('\n')]
+
+ page_regex = r'{0}(.*?)'
+ if self.pageEndMarker is None:
+ page_regex += '(?={0})'
+ else:
+ page_regex += '{1}'
+ page_regex = re.compile(page_regex.format(
+ re.escape(self.pageStartMarker),
+ re.escape(self.pageEndMarker or '')), re.DOTALL)
+
position = 0
- length = 0
- while True:
+ for page_match in page_regex.finditer(text):
+ text_between = text[position:page_match.start()].strip()
+ if text_between:
+ pywikibot.warning('Found text between page markers:
{0}'.format(
+ text_between))
+ position = page_match.end()
try:
- length, title, contents = self.findpage(text[position:])
- except AttributeError:
- if not length:
- pywikibot.output(u'\nStart or end marker not found.')
- else:
- pywikibot.output(u'End of file.')
- break
- except NoTitle as err:
- pywikibot.output(u'\nNo title found - skipping a page.')
- position += err.offset
- continue
+ yield self._extract_information(page_match)
+ except NoTitle:
+ pywikibot.warning(
+ 'No title found for page in line {0}. Skipping.'.format(
+ text.count('\n', 0, page_match.start()) + 1))
- position += length
- yield title, contents
+ if self.pageEndMarker is None:
+ # start marker is also end marker, so it wasn't matched
+ position += len(self.pageStartMarker)
+ rest_of_file = text[position:].strip()
+ if rest_of_file:
+ pywikibot.warning('Found text after the last page marker:
{0}'.format(
+ rest_of_file))
+ @deprecated
def findpage(self, text):
"""Find page to work on."""
pageR = re.compile(re.escape(self.pageStartMarker) + "(.*?)" +
re.escape(self.pageEndMarker), re.DOTALL)
- titleR = re.compile(re.escape(self.titleStartMarker) + "(.*?)" +
- re.escape(self.titleEndMarker))
location = pageR.search(text)
+ title, contents = self._extract_location(location)
+ return location.end(), title, contents
+
+ def _extract_information(self, location):
+ """Return title and contents from the given match."""
if self.include:
contents = location.group()
else:
contents = location.group(1)
try:
- title = titleR.search(contents).group(1)
+ title = self._title_regex.search(contents).group(1)
if self.notitle:
# Remove title (to allow creation of redirects)
- contents = titleR.sub('', contents, count=1)
+ contents = self._title_regex.sub('', contents, count=1)
except AttributeError:
raise NoTitle(location.end())
else:
- return location.end(), title, contents
+ return title, contents
def main(*args):
@@ -273,6 +299,10 @@
pageStartMarker = arg[7:]
elif arg.startswith("-end:"):
pageEndMarker = arg[5:]
+ elif arg == '-autostart':
+ pageStartMarker = None
+ elif arg == '-startisstop':
+ pageEndMarker = None
elif arg.startswith("-file:"):
filename = arg[6:]
elif arg == "-include":
--
To view, visit https://gerrit.wikimedia.org/r/213192
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iff3e11c6237506805003409e13baf6f28f1fe8e8
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits