4 new revisions:

Revision: 273f498899d2
Branch:   default
Author:   Pekka Klärck
Date:     Sun Oct 21 13:30:38 2012
Log:      proto/profiler.py: added help text
http://code.google.com/p/robotframework/source/detail?r=273f498899d2

Revision: b9528ed87b97
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 13:01:36 2012
Log: parsing: Consider non-breaking space as normal space when parsing TSV ...
http://code.google.com/p/robotframework/source/detail?r=b9528ed87b97

Revision: e267f4f982f7
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 13:19:14 2012
Log:      parsing: Consider NBSP as normal space also when parsing HTML....
http://code.google.com/p/robotframework/source/detail?r=e267f4f982f7

Revision: 01eac700dbe5
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 14:20:22 2012
Log:      htmlreader: Refactored the code and fixed handling &tilde;...
http://code.google.com/p/robotframework/source/detail?r=01eac700dbe5

==============================================================================
Revision: 273f498899d2
Branch:   default
Author:   Pekka Klärck
Date:     Sun Oct 21 13:30:38 2012
Log:      proto/profiler.py: added help text
http://code.google.com/p/robotframework/source/detail?r=273f498899d2

Modified:
 /proto/profiler.py

=======================================
--- /proto/profiler.py  Sat Oct 20 12:05:57 2012
+++ /proto/profiler.py  Sun Oct 21 13:30:38 2012
@@ -1,3 +1,10 @@
+#!/usr/bin/env python
+
+"""Profiler for Robot Framework `run` and `rebot`.
+
+Usage: profiler.py run|rebot [options] arguments
+"""
+
 import cProfile
 import pstats
 import os
@@ -11,14 +18,20 @@
 from robot.run import run_cli
 from robot.rebot import rebot_cli

-if sys.argv[1] != 'rebot':
-    profiled = 'run_cli(sys.argv[1:])'
-else:
-    profiled = 'rebot_cli(sys.argv[2:])'

-results = tempfile.mktemp(suffix='.out', prefix='pybot-profile',
+def profile(profiled):
+    results = tempfile.mktemp(suffix='.out', prefix='pybot-profile',
                           dir=join(rootdir, 'tmp'))
-cProfile.run(profiled, results)
-stats = pstats.Stats(results)
-stats.sort_stats('cumulative').print_stats(50)
-os.remove(results)
+    cProfile.run(profiled, results)
+    stats = pstats.Stats(results)
+    stats.sort_stats('cumulative').print_stats(50)
+    os.remove(results)
+
+
+if __name__ == '__main__':
+    try:
+        profiled = {'run': 'run_cli(sys.argv[2:])',
+                    'rebot': 'rebot_cli(sys.argv[2:])'}[sys.argv[1]]
+    except (IndexError, KeyError):
+        sys.exit(__doc__)
+    profile(profiled)

==============================================================================
Revision: b9528ed87b97
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 13:01:36 2012
Log: parsing: Consider non-breaking space as normal space when parsing TSV and TXT.

Update issue 1264
Status: Started
Owner: pekka.klarck
Labels: Target-2.7.5, bwic, ackn
Applied the patch Eemeli and I created otherwise as-is, but fixed stripping of a possible BOM.

Still need to handle HTML format and update User Guide accordingly.
http://code.google.com/p/robotframework/source/detail?r=b9528ed87b97

Added:
 /atest/robot/parsing/non_breaking_space.txt
 /atest/testdata/parsing/nbsp.tsv
 /atest/testdata/parsing/nbsp.txt
Modified:
 /src/robot/parsing/tsvreader.py
 /src/robot/parsing/txtreader.py

=======================================
--- /dev/null
+++ /atest/robot/parsing/non_breaking_space.txt Tue Oct 23 13:01:36 2012
@@ -0,0 +1,17 @@
+*** Settings ***
+Documentation  Regard non-breaking spaces as normal spaces in parsing
+Suite Setup    Run Tests  ${EMPTY}  parsing/nbsp.txt  parsing/nbsp.tsv
+Force Tags     regression   pybot  jybot
+Resource       atest_resource.txt
+
+
+*** Test Cases ***
+
+Non-breaking spaces in plain text file
+    Check test case  ${TESTNAME}
+
+Non-breaking spaces in plain text file with pipes
+    Check test case  ${TESTNAME}
+
+Non-breaking spaces in TSV file
+    Check test case  ${TESTNAME}
=======================================
--- /dev/null
+++ /atest/testdata/parsing/nbsp.tsv    Tue Oct 23 13:01:36 2012
@@ -0,0 +1,3 @@
+*** Test cases ***
+Non-breaking spaces in TSV file
+           Should Be Equal        NBSPs only in first           
NBSPs only in first
=======================================
--- /dev/null
+++ /atest/testdata/parsing/nbsp.txt    Tue Oct 23 13:01:36 2012
@@ -0,0 +1,6 @@
+*** Test cases ***
+Non-breaking spaces in plain text file
+    Should Be Equal    NBSPs only in first    NBSPs only in first
+
+| Non-breaking spaces in plain text file with pipes |
+|  | Should Be Equal | NBSPs only in first | NBSPs only in first
=======================================
--- /src/robot/parsing/tsvreader.py     Tue Mar  6 00:46:30 2012
+++ /src/robot/parsing/tsvreader.py     Tue Oct 23 13:01:36 2012
@@ -15,13 +15,15 @@
 from codecs import BOM_UTF8


+NBSP = u'\xA0'
+
+
 class TsvReader:

     def read(self, tsvfile, populator):
         process = False
         for index, row in enumerate(tsvfile.readlines()):
-            if index == 0 and row.startswith(BOM_UTF8):
-                row = row[len(BOM_UTF8):]
+            row = self._decode_row(row, index == 0)
             cells = [self._process(cell) for cell in self.split_row(row)]
             name = cells and cells[0].strip() or ''
             if name.startswith('*') and \
@@ -31,6 +33,14 @@
                 populator.add(cells)
         populator.eof()

+    def _decode_row(self, row, is_first):
+        if is_first and row.startswith(BOM_UTF8):
+            row = row[len(BOM_UTF8):]
+        row = row.decode('UTF-8')
+        if NBSP in row:
+            row = row.replace(NBSP, ' ')
+        return row
+
     @classmethod
     def split_row(cls, row):
         return row.rstrip().split('\t')
@@ -38,4 +48,4 @@
     def _process(self, cell):
         if len(cell) > 1 and cell[0] == cell[-1] == '"':
             cell = cell[1:-1].replace('""','"')
-        return cell.decode('UTF-8')
+        return cell
=======================================
--- /src/robot/parsing/txtreader.py     Tue Mar  6 00:46:30 2012
+++ /src/robot/parsing/txtreader.py     Tue Oct 23 13:01:36 2012
@@ -30,4 +30,4 @@
         return [cell.strip() for cell in cls._pipe_splitter.split(row)]

     def _process(self, cell):
-        return cell.decode('UTF-8')
+        return cell

==============================================================================
Revision: e267f4f982f7
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 13:19:14 2012
Log:      parsing: Consider NBSP as normal space also when parsing HTML.

Update issue 1264
Now HTML is handled the same way as TXT and TSV.

Noticed that HtmlReader could be cleaned up a little (and its performance possibly improved at the same time), but that's not related to this issue.
http://code.google.com/p/robotframework/source/detail?r=e267f4f982f7

Added:
 /atest/testdata/parsing/nbsp.html
Modified:
 /atest/robot/parsing/non_breaking_space.txt
 /src/robot/parsing/htmlreader.py

=======================================
--- /dev/null
+++ /atest/testdata/parsing/nbsp.html   Tue Oct 23 13:19:14 2012
@@ -0,0 +1,22 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>NBSP</title>
+</head>
+<body>
+<h1>NBSP</h1>
+<table border="1">
+<tr>
+<th colspan="5">  Test cases  </th>
+</tr>
+<tr>
+<td>  Non-breaking spaces in HTML file  </td>
+<td>  Should Be Equal  </td>
+<td>  NBSPs only in first  </td>
+<td>  NBSPs only in first  </td>
+<td></td>
+</tr>
+</table>
+</body>
+</html>
=======================================
--- /atest/robot/parsing/non_breaking_space.txt Tue Oct 23 13:01:36 2012
+++ /atest/robot/parsing/non_breaking_space.txt Tue Oct 23 13:19:14 2012
@@ -1,6 +1,6 @@
 *** Settings ***
 Documentation  Regard non-breaking spaces as normal spaces in parsing
-Suite Setup    Run Tests  ${EMPTY}  parsing/nbsp.txt  parsing/nbsp.tsv
+Suite Setup    Run Tests  ${EMPTY}  parsing/nbsp.*
 Force Tags     regression   pybot  jybot
 Resource       atest_resource.txt

@@ -15,3 +15,6 @@

 Non-breaking spaces in TSV file
     Check test case  ${TESTNAME}
+
+Non-breaking spaces in HTML file
+    Check test case  ${TESTNAME}
=======================================
--- /src/robot/parsing/htmlreader.py    Tue Mar  6 00:46:30 2012
+++ /src/robot/parsing/htmlreader.py    Tue Oct 23 13:19:14 2012
@@ -18,6 +18,7 @@
 from htmlentitydefs import entitydefs

 extra_entitydefs = {'nbsp': ' ',  'apos': "'", 'tilde': '~'}
+NON_BREAKING_SPACE = u'\xA0'


 class HtmlReader(HTMLParser.HTMLParser):
@@ -67,6 +68,8 @@
             return
         if decode:
             data = data.decode(self._encoding)
+        if NON_BREAKING_SPACE in data:
+            data = data.replace(NON_BREAKING_SPACE, ' ')
         self.current_cell.append(data)

     def handle_entityref(self, name):

==============================================================================
Revision: 01eac700dbe5
Branch:   default
Author:   Pekka Klärck
Date:     Tue Oct 23 14:20:22 2012
Log:      htmlreader: Refactored the code and fixed handling &tilde;

Update issue 1265
Status: Done
Fixed.
http://code.google.com/p/robotframework/source/detail?r=01eac700dbe5

Modified:
 /atest/robot/parsing/html_entityrefs.txt
 /atest/testdata/parsing/html_entityrefs.html
 /atest/testdata/parsing/html_entityrefs_variables.py
 /src/robot/parsing/htmlreader.py

=======================================
--- /atest/robot/parsing/html_entityrefs.txt    Wed Aug 31 12:24:58 2011
+++ /atest/robot/parsing/html_entityrefs.txt    Tue Oct 23 14:20:22 2012
@@ -6,18 +6,18 @@

 *** Test Cases ***
 Scandinavian Letters
-    Check Test Case  Scandinavian Letters
+    Check Test Case    ${TEST NAME}

 XML Escapes
-    Check Test Case  XML Escapes
+    Check Test Case    ${TEST NAME}

 Other Escapes
-    Check Test Case  Other Escapes
+    Check Test Case    ${TEST NAME}

 Numerical Escapes
     [Documentation]  These are character references
-    Check Test Case  Numerical Escapes
+    Check Test Case    ${TEST NAME}

 Variables using escapes
-    Check Test Case  Variables using escapes
+    Check Test Case    ${TEST NAME}

=======================================
--- /atest/testdata/parsing/html_entityrefs.html        Tue Oct 18 02:22:17 2011
+++ /atest/testdata/parsing/html_entityrefs.html        Tue Oct 23 14:20:22 2012
@@ -150,8 +150,7 @@
 <tr>
 <td>XML Escapes</td>
 <td>Should Be Equal</td>
-<td>&amp; &amp;amp; &lt; &lt;tag&gt;
-&gt; ' " '" ' &amp;gt;<br></td>
+<td>&amp; &amp;amp; &lt;tag&gt; &apos; &quot; &amp;gt;<br></td>
 <td>${XML ESCAPES}</td>
 <td></td>
 </tr>
@@ -165,7 +164,7 @@
 <tr>
 <td>Other&nbsp;Escapes</td>
 <td>Should Be Equal</td>
-<td>&sect;xxx&sect; &tilde; &apos;&nbsp;&quot;</td>
+<td>&sect;xxx &tilde;&nbsp;~</td>
 <td>${OTHER ESCAPES}</td>
 <td></td>
 </tr>
@@ -219,32 +218,6 @@
 <td></td>
 <td></td>
 </tr>
-</tbody>
-</table>
-<table border="1">
-<colgroup span="99"><col class="name"><col class="action"><col class="arg" span="3"></colgroup>
-<thead> <tr>
-<th>Keyword</th>
-<th>Action</th>
-<th>Argument</th>
-<th>Argument</th>
-<th>Argument</th>
-</tr>
-</thead> <tbody>
-<tr>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
 </tbody>
 </table>
 </body></html>
=======================================
--- /atest/testdata/parsing/html_entityrefs_variables.py Wed Aug 31 12:31:53 2011
+++ /atest/testdata/parsing/html_entityrefs_variables.py Tue Oct 23 14:20:22 2012
@@ -1,3 +1,3 @@
-scandinavian_letters = u'Hyv\u00E4\u00E4 \u00FC\u00F6t\u00E4 \u00C5\u00C4\u00D6'
-xml_escapes = '''& &amp; < <tag> > ' " '" ' &gt;'''
-other_escapes = u'\u00A7xxx\u00A7 \u007E \' "'
+scandinavian_letters = u'Hyv\xE4\xE4 \xFC\xF6t\xE4 \xC5\xC4\xD6'
+xml_escapes = '''& &amp; <tag> ' " &gt;'''
+other_escapes = u'''\xA7xxx \u02DC ~'''
=======================================
--- /src/robot/parsing/htmlreader.py    Tue Oct 23 13:19:14 2012
+++ /src/robot/parsing/htmlreader.py    Tue Oct 23 14:20:22 2012
@@ -14,10 +14,9 @@


 import HTMLParser
-import sys
 from htmlentitydefs import entitydefs

-extra_entitydefs = {'nbsp': ' ',  'apos': "'", 'tilde': '~'}
+
 NON_BREAKING_SPACE = u'\xA0'


@@ -29,16 +28,16 @@
     def __init__(self):
         HTMLParser.HTMLParser.__init__(self)
         self._encoding = 'ISO-8859-1'
-        self._handlers = { 'table_start' : self.table_start,
-                           'table_end'   : self.table_end,
-                           'tr_start'    : self.tr_start,
-                           'tr_end'      : self.tr_end,
-                           'td_start'    : self.td_start,
-                           'td_end'      : self.td_end,
-                           'th_start'    : self.td_start,
-                           'th_end'      : self.td_end,
-                           'br_start'    : self.br_start,
-                           'meta_start'  : self.meta_start }
+        self._handlers = {'table_start' : self.table_start,
+                          'table_end'   : self.table_end,
+                          'tr_start'    : self.tr_start,
+                          'tr_end'      : self.tr_end,
+                          'td_start'    : self.td_start,
+                          'td_end'      : self.td_end,
+                          'th_start'    : self.td_start,
+                          'th_end'      : self.td_end,
+                          'br_start'    : self.br_start,
+                          'meta_start'  : self.meta_start}

     def read(self, htmlfile, populator):
         self.populator = populator
@@ -46,13 +45,16 @@
         self.current_row = None
         self.current_cell = None
         for line in htmlfile.readlines():
-            self.feed(line)
+            self.feed(self._decode(line))
         # Calling close is required by the HTMLParser but may cause problems
         # if the same instance of our HtmlParser is reused. Currently it's
         # used only once so there's no problem.
         self.close()
         self.populator.eof()

+    def _decode(self, line):
+        return line.decode(self._encoding)
+
     def handle_starttag(self, tag, attrs):
         handler = self._handlers.get(tag+'_start')
         if handler is not None:
@@ -63,22 +65,20 @@
         if handler is not None:
             handler()

-    def handle_data(self, data, decode=True):
+    def handle_data(self, data):
         if self.state == self.IGNORE or self.current_cell is None:
             return
-        if decode:
-            data = data.decode(self._encoding)
         if NON_BREAKING_SPACE in data:
             data = data.replace(NON_BREAKING_SPACE, ' ')
         self.current_cell.append(data)

     def handle_entityref(self, name):
         value = self._handle_entityref(name)
-        self.handle_data(value, decode=False)
+        self.handle_data(value)

     def _handle_entityref(self, name):
-        if extra_entitydefs.has_key(name):
-            return extra_entitydefs[name]
+        if name == 'apos':  # missing from entitydefs
+            return "'"
         try:
             value = entitydefs[name]
         except KeyError:
@@ -89,12 +89,12 @@

     def handle_charref(self, number):
         value = self._handle_charref(number)
-        self.handle_data(value, decode=False)
+        self.handle_data(value)

     def _handle_charref(self, number):
-        if number.lower().startswith('x'):
+        if number.startswith(('x', 'X')):
+            base = 16
             number = number[1:]
-            base = 16
         else:
             base = 10
         try:
@@ -133,13 +133,8 @@
         if self.current_cell is not None:
             self.td_end()
         if self.state == self.INITIAL:
-            if len(self.current_row) > 0:
-                if self.populator.start_table(self.current_row):
-                    self.state = self.PROCESS
-                else:
-                    self.state = self.IGNORE
-            else:
-                self.state = self.IGNORE
+            accepted = self.populator.start_table(self.current_row)
+            self.state = self.PROCESS if accepted else self.IGNORE
         elif self.state == self.PROCESS:
             self.populator.add(self.current_row)
         self.current_row = None
@@ -158,8 +153,7 @@
         self.current_cell = None

     def br_start(self, attrs=None):
-        if self.current_cell is not None and self.state != self.IGNORE:
-            self.current_cell.append('\n')
+        self.handle_data('\n')

     def meta_start(self, attrs):
         encoding = self._get_encoding_from_meta(attrs)
@@ -178,7 +172,7 @@
                     token = token.strip()
                     if token.lower().startswith('charset='):
                         encoding = token[8:]
-        return valid_http_equiv and encoding or None
+        return encoding if valid_http_equiv else None

     def _get_encoding_from_pi(self, data):
         data = data.strip()
@@ -193,18 +187,3 @@
                     encoding = encoding[1:-1]
                 return encoding
         return None
-
-
-# Workaround for following bug in Python 2.6: http://bugs.python.org/issue3932
-if sys.version_info[:2] > (2, 5):
-    def unescape_from_py25(self, s):
-        if '&' not in s:
-            return s
-        s = s.replace("&lt;", "<")
-        s = s.replace("&gt;", ">")
-        s = s.replace("&apos;", "'")
-        s = s.replace("&quot;", '"')
-        s = s.replace("&amp;", "&") # Must be last
-        return s
-
-    HTMLParser.HTMLParser.unescape = unescape_from_py25

Reply via email to