Author: Ronan Lamy <[email protected]>
Branch: PEP393
Changeset: r89893:92709629b937
Date: 2017-02-01 16:33 +0000
http://bitbucket.org/pypy/pypy/changeset/92709629b937/
Log: Copy a bunch of macros from the CPython header
diff --git a/pypy/module/cpyext/include/unicodeobject.h
b/pypy/module/cpyext/include/unicodeobject.h
--- a/pypy/module/cpyext/include/unicodeobject.h
+++ b/pypy/module/cpyext/include/unicodeobject.h
@@ -7,6 +7,30 @@
#include <cpyext_unicodeobject.h>
+/* Fast access macros */
+#ifndef Py_LIMITED_API
+
+/* Return the stored length of the wstr representation: PyASCIIObject.length
+ for compact ASCII strings, PyCompactUnicodeObject.wstr_length otherwise.
+ No type checks are performed. The argument is parenthesized before the
+ cast so that any pointer-valued expression may be passed. */
+#define PyUnicode_WSTR_LENGTH(op) \
+ (PyUnicode_IS_COMPACT_ASCII(op) ? \
+ ((PyASCIIObject*)(op))->length : \
+ ((PyCompactUnicodeObject*)(op))->wstr_length)
+
+/* Returns the deprecated Py_UNICODE representation's size in code units
+ (this includes surrogate pairs as 2 units).
+ If the Py_UNICODE representation is not available, it will be computed
+ on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
+
+#define PyUnicode_GET_SIZE(op) \
+ (assert(PyUnicode_Check(op)), \
+ (((PyASCIIObject *)(op))->wstr) ? \
+ PyUnicode_WSTR_LENGTH(op) : \
+ ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
+ assert(((PyASCIIObject *)(op))->wstr), \
+ PyUnicode_WSTR_LENGTH(op)))
+
+#define PyUnicode_GET_DATA_SIZE(op) \
+ (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
+
/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
representation on demand. Using this macro is very inefficient now,
try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
@@ -21,12 +45,280 @@
((const char *)(PyUnicode_AS_UNICODE(op)))
-PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char *format, va_list
vargs);
-PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char *format, ...);
+/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
+/* Values for PyASCIIObject.state: */
+
+/* Interning state. */
+#define SSTATE_NOT_INTERNED 0
+#define SSTATE_INTERNED_MORTAL 1
+#define SSTATE_INTERNED_IMMORTAL 2
+
+/* Return true if the string contains only ASCII characters, or 0 if not. The
+ string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
+ ready. The argument is parenthesized before the cast so that any
+ pointer-valued expression may be passed. */
+#define PyUnicode_IS_ASCII(op) \
+ (assert(PyUnicode_Check(op)), \
+ assert(PyUnicode_IS_READY(op)), \
+ ((PyASCIIObject*)(op))->state.ascii)
+
+/* Return true if the string is compact or 0 if not.
+ No type checks or Ready calls are performed. */
+#define PyUnicode_IS_COMPACT(op) \
+ (((PyASCIIObject*)(op))->state.compact)
+
+/* Return true if the string is a compact ASCII string (use PyASCIIObject
+ structure), or 0 if not. No type checks or Ready calls are performed.
+ The argument is parenthesized before the cast so that any pointer-valued
+ expression may be passed. */
+#define PyUnicode_IS_COMPACT_ASCII(op) \
+ (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
+
+enum PyUnicode_Kind {
+/* String contains only wstr byte characters. This is only possible
+ when the string was created with a legacy API and _PyUnicode_Ready()
+ has not been called yet. */
+ PyUnicode_WCHAR_KIND = 0,
+/* Return values of the PyUnicode_KIND() macro: */
+ PyUnicode_1BYTE_KIND = 1,
+ PyUnicode_2BYTE_KIND = 2,
+ PyUnicode_4BYTE_KIND = 4
+};
+
+/* Return pointers to the canonical representation cast to unsigned char,
+ Py_UCS2, or Py_UCS4 for direct character access.
+ No checks are performed, use PyUnicode_KIND() before to ensure
+ these will work correctly. */
+
+#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
+#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
+#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
+
+/* Return one of the PyUnicode_*_KIND values defined above. */
+#define PyUnicode_KIND(op) \
+ (assert(PyUnicode_Check(op)), \
+ assert(PyUnicode_IS_READY(op)), \
+ ((PyASCIIObject *)(op))->state.kind)
+
+/* Return a void pointer to the raw unicode buffer. */
+#define _PyUnicode_COMPACT_DATA(op) \
+ (PyUnicode_IS_ASCII(op) ? \
+ ((void*)((PyASCIIObject*)(op) + 1)) : \
+ ((void*)((PyCompactUnicodeObject*)(op) + 1)))
+
+#define _PyUnicode_NONCOMPACT_DATA(op) \
+ (assert(((PyUnicodeObject*)(op))->data), \
+ ((((PyUnicodeObject *)(op))->data)))
+
+#define PyUnicode_DATA(op) \
+ (assert(PyUnicode_Check(op)), \
+ PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
+ _PyUnicode_NONCOMPACT_DATA(op))
+
+/* In the access macros below, "kind" may be evaluated more than once.
+ All other macro parameters are evaluated exactly once, so it is safe
+ to put side effects into them (such as increasing the index). */
+
+/* Write into the canonical representation, this macro does not do any sanity
+ checks and is intended for usage in loops. The caller should cache the
+ kind and data pointers obtained from other macro calls.
+ index is the index in the string (starts at 0) and value is the new
+ code point value which should be written to that location. */
+#define PyUnicode_WRITE(kind, data, index, value) \
+ do { \
+ switch ((kind)) { \
+ case PyUnicode_1BYTE_KIND: { \
+ ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
+ break; \
+ } \
+ case PyUnicode_2BYTE_KIND: { \
+ ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
+ break; \
+ } \
+ default: { \
+ assert((kind) == PyUnicode_4BYTE_KIND); \
+ ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
+ } \
+ } \
+ } while (0)
+
+/* Read a code point from the string's canonical representation. No checks
+ or ready calls are performed. */
+#define PyUnicode_READ(kind, data, index) \
+ ((Py_UCS4) \
+ ((kind) == PyUnicode_1BYTE_KIND ? \
+ ((const Py_UCS1 *)(data))[(index)] : \
+ ((kind) == PyUnicode_2BYTE_KIND ? \
+ ((const Py_UCS2 *)(data))[(index)] : \
+ ((const Py_UCS4 *)(data))[(index)] \
+ ) \
+ ))
+
+/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
+ calls PyUnicode_KIND() and might call it twice. For single reads, use
+ PyUnicode_READ_CHAR, for multiple consecutive reads callers should
+ cache kind and use PyUnicode_READ instead. */
+#define PyUnicode_READ_CHAR(unicode, index) \
+ (assert(PyUnicode_Check(unicode)), \
+ assert(PyUnicode_IS_READY(unicode)), \
+ (Py_UCS4) \
+ (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
+ ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
+ (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
+ ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
+ ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
+ ) \
+ ))
+
+/* Returns the length of the unicode string. The caller has to make sure that
+ the string has its canonical representation set before calling
+ this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
+#define PyUnicode_GET_LENGTH(op) \
+ (assert(PyUnicode_Check(op)), \
+ assert(PyUnicode_IS_READY(op)), \
+ ((PyASCIIObject *)(op))->length)
+
+
+/* Fast check to determine whether an object is ready. Equivalent to
+ (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
+ The argument is parenthesized before the cast so that any pointer-valued
+ expression may be passed. */
+
+#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
+
+/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
+ case. If the canonical representation is not yet set, it will still call
+ _PyUnicode_Ready().
+ Returns 0 on success and -1 on errors. */
+#define PyUnicode_READY(op) \
+ (assert(PyUnicode_Check(op)), \
+ (PyUnicode_IS_READY(op) ? \
+ 0 : _PyUnicode_Ready((PyObject *)(op))))
+
+/* Return a maximum character value which is suitable for creating another
+ string based on op. This is always an approximation but more efficient
+ than iterating over the string. */
+#define PyUnicode_MAX_CHAR_VALUE(op) \
+ (assert(PyUnicode_IS_READY(op)), \
+ (PyUnicode_IS_ASCII(op) ? \
+ (0x7f) : \
+ (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
+ (0xffU) : \
+ (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
+ (0xffffU) : \
+ (0x10ffffU)))))
+
+#endif
+
+/* --- Constants ---------------------------------------------------------- */
+
+/* This Unicode character will be used as replacement character during
+ decoding if the errors argument is set to "replace". Note: the
+ Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
+ Unicode 3.0. */
+
+#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
+
+/* === Public API ========================================================= */
+
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
+ const char *format, /* ASCII-encoded string */
+ va_list vargs
+ );
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
+ const char *format, /* ASCII-encoded string */
+ ...
+ );
+
+/* Use only if you know it's a string */
+#define PyUnicode_CHECK_INTERNED(op) \
+ (((PyASCIIObject *)(op))->state.interned)
+
+/* --- wchar_t support for platforms which support it --------------------- */
+
+#ifdef HAVE_WCHAR_H
+
+/* Convert the Unicode object to a wide character string. The output string
+ always ends with a nul character. If size is not NULL, write the number of
+ wide characters (excluding the null character) into *size.
+
+ Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
+ on success. On error, returns NULL, *size is undefined and raises a
+ MemoryError. */
+
+PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
+ PyObject *unicode, /* Unicode object */
+ Py_ssize_t *size /* number of characters of the result */
+ );
+
+#endif
+
+/* === Builtin Codecs =====================================================
+
+ Many of these APIs take two arguments encoding and errors. These
+ parameters encoding and errors have the same semantics as the ones
+ of the builtin str() API.
+
+ Setting encoding to NULL causes the default encoding (UTF-8) to be used.
+
+ Error handling is set by errors which may also be set to NULL
+ meaning to use the default handling defined for the codec. Default
+ error handling for all builtin codecs is "strict" (ValueErrors are
+ raised).
+
+ The codecs all use a similar interface. Only deviations from the
+ generic ones are documented.
+
+*/
+
+/* --- Manage the default encoding ---------------------------------------- */
+
+/* Returns a pointer to the default encoding (UTF-8) of the
+ Unicode object unicode and the size of the encoded representation
+ in bytes stored in *size.
+
+ In case of an error, *size is not set.
+
+ This function caches the UTF-8 encoded string in the unicodeobject
+ and subsequent calls will return the same string. The memory is released
+ when the unicodeobject is deallocated.
+
+ _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
+ support the previous internal function with the same behaviour.
+
+ *** This API is for interpreter INTERNAL USE ONLY and will likely
+ *** be removed or changed in the future.
+
+ *** If you need to access the Unicode object as UTF-8 bytes string,
+ *** please use PyUnicode_AsUTF8String() instead.
+*/
+
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
+ PyObject *unicode,
+ Py_ssize_t *size);
+#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
+#endif
+
+/* Returns a pointer to the default encoding (UTF-8) of the
+ Unicode object unicode.
+
+ Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
+ in the unicodeobject.
+
+ _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
+ support the previous internal function with the same behaviour.
+
+ Use of this API is DEPRECATED since no size information can be
+ extracted from the returned data.
+
+ *** This API is for interpreter INTERNAL USE ONLY and will likely
+ *** be removed or changed for Python 3.1.
+
+ *** If you need to access the Unicode object as UTF-8 bytes string,
+ *** please use PyUnicode_AsUTF8String() instead.
+
+*/
+
+#ifndef Py_LIMITED_API
#define _PyUnicode_AsString PyUnicode_AsUTF8
-
-PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(PyObject *unicode, Py_ssize_t
*size);
+#endif
Py_LOCAL_INLINE(size_t) Py_UNICODE_strlen(const Py_UNICODE *u)
{
diff --git a/pypy/module/cpyext/unicodeobject.py
b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -115,6 +115,9 @@
def set_ascii(py_obj, value):
get_state(py_obj).c_ascii = cts.cast('unsigned int', value)
+def set_ready(py_obj, value):
+ get_state(py_obj).c_ready = cts.cast('unsigned int', value)
+
def get_wbuffer(py_obj):
py_obj = cts.cast('PyASCIIObject*', py_obj)
return py_obj.c_wstr
@@ -253,31 +256,6 @@
"""Get the maximum ordinal for a Unicode character."""
return runicode.UNICHR(runicode.MAXUNICODE)
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_DATA_SIZE(space, w_obj):
- """Return the size of the object's internal buffer in bytes. o has to be a
- PyUnicodeObject (not checked)."""
- return rffi.sizeof(Py_UNICODE) * PyUnicode_GET_SIZE(space, w_obj)
-
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_SIZE(space, w_obj):
- """Return the size of the object. obj is a PyUnicodeObject (not
- checked)."""
- return space.len_w(w_obj)
-
-@cpython_api([PyObject], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_LENGTH(space, w_obj):
- """Return the length of the Unicode string, in code points.
- o has to be a Unicode object in the "canonical" representation
- (not checked)."""
- assert isinstance(w_obj, unicodeobject.W_UnicodeObject)
- return space.len_w(w_obj)
-
-@cpython_api([PyObject], rffi.INT, error=CANNOT_FAIL)
-def PyUnicode_IS_READY(space, w_obj):
- # PyPy is always ready.
- return space.w_True
-
@cts.decl("int _PyUnicode_Ready(PyObject *unicode)", error=-1)
def _PyUnicode_Ready(space, w_obj):
assert isinstance(w_obj, unicodeobject.W_UnicodeObject)
@@ -304,6 +282,7 @@
set_ascii(py_obj, 0)
set_utf8(py_obj, 0)
set_utf8_len(py_obj, 0)
+ set_ready(py_obj, 1)
elif maxchar < 65536:
# XXX: assumes that sizeof(wchar_t) == 4
ucs2_str = unicode_encode_utf_16(
@@ -314,6 +293,7 @@
set_kind(py_obj, _2BYTE_KIND)
set_utf8(py_obj, 0)
set_utf8_len(py_obj, 0)
+ set_ready(py_obj, 1)
else:
# XXX: assumes that sizeof(wchar_t) == 4
ucs4_data = get_wbuffer(py_obj)
@@ -322,6 +302,7 @@
set_kind(py_obj, _4BYTE_KIND)
set_utf8(py_obj, 0)
set_utf8_len(py_obj, 0)
+ set_ready(py_obj, 1)
@cpython_api([PyObject], rffi.CWCHARP)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit