Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-06-15 Thread Julian Andres Klode
On Wed, Jun 13, 2012 at 06:35:19PM +0100, Colin Watson wrote:
> On Sat, Jan 21, 2012 at 03:10:54AM +, Colin Watson wrote:
> > (That said, I haven't entirely got the python-debian test suite to pass
> > yet.  It's parsing mixed-encoding files in terrifying ways which are
> > going to require some rearrangement to work well with Python 3.  So,
> > strictly, I can't claim to be 100% confident in this change yet; but I'd
> > appreciate knowing whether the general approach is OK with you.)
> 
> This is no longer a concern (see #625509), and I would really appreciate
> feedback from python-apt maintainers on this; there's a substantial
> stack of other Python 3 porting work that depends on getting a
> python3-debian into the archive, and that depends on this patch ...
> 
> Here's an updated version which applies to HEAD.  Please note that this
> also fixes what I consider a bug whereby apt_pkg.TagFile() accepts byte
> strings but not Unicode strings in Python 2.

Committed, with documentation in doc/source/library/apt_pkg.rst added.

-- 
Julian Andres Klode  - Debian Developer, Ubuntu Member

See http://wiki.debian.org/JulianAndresKlode and http://jak-linux.org/.



-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org



Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-06-13 Thread Colin Watson
On Sat, Jan 21, 2012 at 03:10:54AM +, Colin Watson wrote:
> (That said, I haven't entirely got the python-debian test suite to pass
> yet.  It's parsing mixed-encoding files in terrifying ways which are
> going to require some rearrangement to work well with Python 3.  So,
> strictly, I can't claim to be 100% confident in this change yet; but I'd
> appreciate knowing whether the general approach is OK with you.)

This is no longer a concern (see #625509), and I would really appreciate
feedback from python-apt maintainers on this; there's a substantial
stack of other Python 3 porting work that depends on getting a
python3-debian into the archive, and that depends on this patch ...

Here's an updated version which applies to HEAD.  Please note that this
also fixes what I consider a bug whereby apt_pkg.TagFile() accepts byte
strings but not Unicode strings in Python 2.

=== modified file 'python/tag.cc'
--- python/tag.cc   2012-02-06 13:55:25 +
+++ python/tag.cc   2012-06-13 15:29:08 +
@@ -38,6 +38,10 @@ using namespace std;
 struct TagSecData : public CppPyObject
 {
char *Data;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // The owner of the TagFile is a Python file object.
@@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject<
 {
TagSecData *Section;
FileFd Fd;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // Traversal and Clean for owned objects
@@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) {
 return 0;
 }
 
+// Helpers to return Unicode or bytes as appropriate.
+#if PY_MAJOR_VERSION < 3
+#define TagSecString_FromStringAndSize(self, v, len) \
+PyString_FromStringAndSize((v), (len))
+#define TagSecString_FromString(self, v) PyString_FromString(v)
+#else
+PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v,
+Py_ssize_t len) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromStringAndSize(v, len);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromStringAndSize(v, len);
+}
+
+PyObject *TagSecString_FromString(PyObject *self, const char *v) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromString(v);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, strlen(v),
+ PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromString(v);
+}
+#endif
+
 
/*}}}*/
 // TagSecFree - Free a Tag Section /*{{{*/
@@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindRaw =
@@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
 
const char *Start;
const char *Stop;
GetCpp(Self).Get(Start,Stop,Pos);
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindFlag =
@@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject
 // Map access, operator []
 static PyObject *TagSecMap(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   {
-  PyErr_SetNone(PyExc_TypeError);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
   return 0;
-   }
-
const char *Start;
const char *Stop;
-   if (GetCpp(Self).Find(PyString_AsString(Arg),Start,Stop) == 
false)
+   if (GetCpp(Self).Find(Name,Start,Stop) == false)
{
-  PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg));
+  PyErr_SetString(PyExc_KeyError,Name);
   return 0;
}
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 // len() operation
@@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject *
 
 static int TagSecContains(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   return 0;
-   const char *Name = PyString_AsString(Arg);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
+  return 0;
const char *Start;
const char *Stop;
if (GetCpp(Self).Find(Name,Start,Stop) == false)
@@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel
const char *Start;
const char *Stop;
GetCpp(Self).GetSection(Start,Stop);
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return Ta

Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-01-20 Thread Colin Watson
On Fri, Jan 20, 2012 at 03:57:45PM +, Colin Watson wrote:
> OK.  How about something like this?  I added both an explicit bytes=
> parameter and a fallback which tries to detect the encoding from the
> file object.

This crashed in the python-debian test suite due to a silly mistake.
Here's a fixed version.

(That said, I haven't entirely got the python-debian test suite to pass
yet.  It's parsing mixed-encoding files in terrifying ways which are
going to require some rearrangement to work well with Python 3.  So,
strictly, I can't claim to be 100% confident in this change yet; but I'd
appreciate knowing whether the general approach is OK with you.)

=== modified file 'python/tag.cc'
--- python/tag.cc   2011-11-10 16:20:58 +
+++ python/tag.cc   2012-01-20 17:12:36 +
@@ -38,6 +38,10 @@ using namespace std;
 struct TagSecData : public CppPyObject
 {
char *Data;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // The owner of the TagFile is a Python file object.
@@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject<
 {
TagSecData *Section;
FileFd Fd;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // Traversal and Clean for owned objects
@@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) {
 return 0;
 }
 
+// Helpers to return Unicode or bytes as appropriate.
+#if PY_MAJOR_VERSION < 3
+#define TagSecString_FromStringAndSize(self, v, len) \
+PyString_FromStringAndSize((v), (len))
+#define TagSecString_FromString(self, v) PyString_FromString(v)
+#else
+PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v,
+Py_ssize_t len) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromStringAndSize(v, len);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromStringAndSize(v, len);
+}
+
+PyObject *TagSecString_FromString(PyObject *self, const char *v) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromString(v);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, strlen(v),
+ PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromString(v);
+}
+#endif
+
 
/*}}}*/
 // TagSecFree - Free a Tag Section /*{{{*/
@@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindRaw =
@@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
 
const char *Start;
const char *Stop;
GetCpp(Self).Get(Start,Stop,Pos);
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindFlag =
@@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject
 // Map access, operator []
 static PyObject *TagSecMap(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   {
-  PyErr_SetNone(PyExc_TypeError);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
   return 0;
-   }
-
const char *Start;
const char *Stop;
-   if (GetCpp(Self).Find(PyString_AsString(Arg),Start,Stop) == 
false)
+   if (GetCpp(Self).Find(Name,Start,Stop) == false)
{
-  PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg));
+  PyErr_SetString(PyExc_KeyError,Name);
   return 0;
}
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 // len() operation
@@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject *
 
 static int TagSecContains(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   return 0;
-   const char *Name = PyString_AsString(Arg);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
+  return 0;
const char *Start;
const char *Stop;
if (GetCpp(Self).Find(Name,Start,Stop) == false)
@@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel
const char *Start;
const char *Stop;
GetCpp(Self).GetSection(Start,Stop);
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
/*}}}*/
 // TagFile Wrappers/*{{{*/
@@ -2

Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-01-20 Thread Colin Watson
On Wed, Jan 18, 2012 at 12:14:02PM +0100, Julian Andres Klode wrote:
> You'd also need to take care of TagSection if that is done, which should
> then work in bytes mode when passed a bytes string.
> 
> Basically you'd need to modify TagSection and TagFile to both store whether
> to use bytes or unicode and pass the value of that flag from the TagFile
> to the TagSection. Then create a function
> 
>   PyObject *TagFile_ToString(char *s, size_t n)
> 
> or similar that uses PyString_* functions or PyBytes_ functions depending
> on the context (where PyString is mapped to unicode in Python 3, and
> str in Python 2). Then use that function everywhere we currently
> create strings in the TagFile.

OK.  How about something like this?  I added both an explicit bytes=
parameter and a fallback which tries to detect the encoding from the
file object.

=== modified file 'python/tag.cc'
--- python/tag.cc   2011-11-10 16:20:58 +
+++ python/tag.cc   2012-01-20 14:47:56 +
@@ -38,6 +38,10 @@ using namespace std;
 struct TagSecData : public CppPyObject
 {
char *Data;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // The owner of the TagFile is a Python file object.
@@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject<
 {
TagSecData *Section;
FileFd Fd;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // Traversal and Clean for owned objects
@@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) {
 return 0;
 }
 
+// Helpers to return Unicode or bytes as appropriate.
+#if PY_MAJOR_VERSION < 3
+#define TagSecString_FromStringAndSize(self, v, len) \
+PyString_FromStringAndSize((v), (len))
+#define TagSecString_FromString(self, v) PyString_FromString(v)
+#else
+PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v,
+Py_ssize_t len) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromStringAndSize(v, len);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromStringAndSize(v, len);
+}
+
+PyObject *TagSecString_FromString(PyObject *self, const char *v) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+  return PyBytes_FromString(v);
+   else if (Self->Encoding)
+  return PyUnicode_Decode(v, strlen(v),
+ PyUnicode_AsString(Self->Encoding), 0);
+   else
+  return PyUnicode_FromString(v);
+}
+#endif
+
 
/*}}}*/
 // TagSecFree - Free a Tag Section /*{{{*/
@@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindRaw =
@@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject
{
   if (Default == 0)
 Py_RETURN_NONE;
-  return PyString_FromString(Default);
+  return TagSecString_FromString(Self,Default);
}
 
const char *Start;
const char *Stop;
GetCpp(Self).Get(Start,Stop,Pos);
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindFlag =
@@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject
 // Map access, operator []
 static PyObject *TagSecMap(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   {
-  PyErr_SetNone(PyExc_TypeError);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
   return 0;
-   }
-
const char *Start;
const char *Stop;
-   if (GetCpp(Self).Find(PyString_AsString(Arg),Start,Stop) == 
false)
+   if (GetCpp(Self).Find(Name,Start,Stop) == false)
{
-  PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg));
+  PyErr_SetString(PyExc_KeyError,Name);
   return 0;
}
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 // len() operation
@@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject *
 
 static int TagSecContains(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   return 0;
-   const char *Name = PyString_AsString(Arg);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
+  return 0;
const char *Start;
const char *Stop;
if (GetCpp(Self).Find(Name,Start,Stop) == false)
@@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel
const char *Start;
const char *Stop;
GetCpp(Self).GetSection(Start,Stop);
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSi

Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-01-18 Thread Julian Andres Klode
On Wed, Jan 18, 2012 at 10:02:31AM +, Colin Watson wrote:
> On Wed, Jan 18, 2012 at 12:56:03AM +, Colin Watson wrote:
> > python-debian's test suite also tests that it's possible to parse old
> > Sources files in *mixed* encodings.  This is going to be harder because
> > it basically means having apt_pkg.TagSection return bytes, which I don't
> > think is desirable in general.  Maybe this could be optional somehow?
> 
> Thinking about it, this seems a reasonable thing to make switchable in
> TagFile's constructor.  After all:
> 
>   >>> with open("test", encoding="iso-8859-1") as test:
>   ... print(test.read().__class__)
>   ...
>   
>   >>> with open("test", mode="rb") as test:
>   ... print(test.read().__class__)
>   ...
>   
> 
> So there's clear precedent in the language for the same method returning
> str or bytes depending on how the class was constructed.  Maybe a bytes=
> keyword argument?

You'd also need to take care of TagSection if that is done, which should
then work in bytes mode when passed a bytes string.

Basically you'd need to modify TagSection and TagFile to both store whether
to use bytes or unicode and pass the value of that flag from the TagFile
to the TagSection. Then create a function

PyObject *TagFile_ToString(char *s, size_t n)

or similar that uses PyString_* functions or PyBytes_ functions depending
on the context (where PyString is mapped to unicode in Python 3, and
str in Python 2). Then use that function everywhere we currently
create strings in the TagFile.


-- 
Julian Andres Klode  - Debian Developer, Ubuntu Member

See http://wiki.debian.org/JulianAndresKlode and http://jak-linux.org/.



-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org



Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-01-18 Thread Colin Watson
On Wed, Jan 18, 2012 at 12:56:03AM +, Colin Watson wrote:
> python-debian's test suite also tests that it's possible to parse old
> Sources files in *mixed* encodings.  This is going to be harder because
> it basically means having apt_pkg.TagSection return bytes, which I don't
> think is desirable in general.  Maybe this could be optional somehow?

Thinking about it, this seems a reasonable thing to make switchable in
TagFile's constructor.  After all:

  >>> with open("test", encoding="iso-8859-1") as test:
  ... print(test.read().__class__)
  ...
  
  >>> with open("test", mode="rb") as test:
  ... print(test.read().__class__)
  ...
  

So there's clear precedent in the language for the same method returning
str or bytes depending on how the class was constructed.  Maybe a bytes=
keyword argument?

-- 
Colin Watson   [cjwat...@debian.org]



-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org



Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles

2012-01-17 Thread Colin Watson
Package: python3-apt
Version: 0.8.3
Severity: normal

In Python 3, I can find no way to get apt_pkg.TagFile to read a file
that isn't encoded in UTF-8:

  >>> import sys
  >>> import apt_pkg
  >>> sys.version
  '3.2.2+ (default, Jan  8 2012, 07:26:18) \n[GCC 4.6.2]'
  >>> with open("test", "w", encoding="iso-8859-1") as test:
  ... print("Package: test", file=test)
  ... print("Maintainer: M\xe4intainer ", file=test)
  ... print(file=test)
  ...
  >>> tagfile = apt_pkg.TagFile(open("test", "rb"))
  >>> next(tagfile)["Maintainer"]
  Traceback (most recent call last):
File "", line 1, in 
  UnicodeDecodeError: 'utf8' codec can't decode byte 0xe4 in position 1: 
invalid continuation byte
  >>> tagfile = apt_pkg.TagFile(open("test", encoding="iso-8859-1"))
  >>> next(tagfile)["Maintainer"]
  Traceback (most recent call last):
File "", line 1, in 
  UnicodeDecodeError: 'utf8' codec can't decode byte 0xe4 in position 1: 
invalid continuation byte

Whereas in Python 2:

  >>> import sys
  >>> import apt_pkg
  >>> sys.version
  '2.7.2+ (default, Jan 13 2012, 23:15:17) \n[GCC 4.6.2]'
  >>> tagfile = apt_pkg.TagFile(open("test", "rb"))
  >>> tagfile.next()["Maintainer"]
  'M\xe4intainer '

This breaks part of the python-debian test suite (I'm currently trying
to port python-debian to Python 3), which is interested in such things
as making sure that it's possible to parse old Sources files from before
Debian switched to UTF-8.

A fix is tricky.  We can't do anything actually nice using Python 3's
I/O facilities, because python-apt just pokes around to find the file
descriptor and passes that directly to apt.  However, one idea that
comes to mind is that if you open a file with the 'encoding' parameter
then python-apt could spot that in the file object, remember it, and
decode bytes using that encoding any time it wants to return a Unicode
string.

python-debian's test suite also tests that it's possible to parse old
Sources files in *mixed* encodings.  This is going to be harder because
it basically means having apt_pkg.TagSection return bytes, which I don't
think is desirable in general.  Maybe this could be optional somehow?

Thanks,

-- 
Colin Watson   [cjwat...@debian.org]



-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org