pollita Tue Dec 5 04:13:47 2006 UTC
Modified files:
/php-src/ext/standard file.c file.h
/php-src/ext/standard/tests/file bug12556.phpt fgetcsv.phpt
Log:
Unicode upgrade for fgetcsv()
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/file.c?r1=1.469&r2=1.470&diff_format=u
Index: php-src/ext/standard/file.c
diff -u php-src/ext/standard/file.c:1.469 php-src/ext/standard/file.c:1.470
--- php-src/ext/standard/file.c:1.469 Wed Nov 22 12:56:26 2006
+++ php-src/ext/standard/file.c Tue Dec 5 04:13:46 2006
@@ -21,7 +21,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: file.c,v 1.469 2006/11/22 12:56:26 pajoye Exp $ */
+/* $Id: file.c,v 1.470 2006/12/05 04:13:46 pollita Exp $ */
/* Synced with php 3.0 revision 1.218 1999-06-16 [ssb] */
@@ -1932,43 +1932,6 @@
}
/* }}} */
-static const char *php_fgetcsv_lookup_trailing_spaces(const char *ptr, size_t
len, const char delimiter TSRMLS_DC)
-{
- int inc_len;
- unsigned char last_chars[2] = { 0, 0 };
-
- while (len > 0) {
- inc_len = (*ptr == '\0' ? 1: php_mblen(ptr, len));
- switch (inc_len) {
- case -2:
- case -1:
- inc_len = 1;
- php_mblen(NULL, 0);
- break;
- case 0:
- goto quit_loop;
- case 1:
- default:
- last_chars[0] = last_chars[1];
- last_chars[1] = *ptr;
- break;
- }
- ptr += inc_len;
- len -= inc_len;
- }
-quit_loop:
- switch (last_chars[1]) {
- case '\n':
- if (last_chars[0] == '\r') {
- return ptr - 2;
- }
- /* break is omitted intentionally */
- case '\r':
- return ptr - 1;
- }
- return ptr;
-}
-
#define FPUTCSV_FLD_CHK(c) memchr(Z_STRVAL_PP(field), c, Z_STRLEN_PP(field))
/* {{{ proto int fputcsv(resource fp, array fields [, string delimiter [,
string enclosure]])
@@ -2072,87 +2035,149 @@
}
/* }}} */
-/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [,
string enclosure]]])
+/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [,
string enclosure[, string escape]]]]) U
Get line from file pointer and parse for CSV fields */
-/* UTODO: Accept unicode contents */
+/* Enforce the historical one-character limit on a separator argument.
+ *
+ * `field` names one of the local argument triples of PHP_FUNCTION(fgetcsv):
+ * field (the data), field_len and field_type. The macro also reads the
+ * local `argc`.
+ *
+ * - More than four arguments passed: the value is left untouched.
+ * - IS_STRING longer than one byte: E_NOTICE, field_len is cut to 1.
+ * - IS_UNICODE with more than one code point: E_NOTICE, field_len is cut
+ *   to the length of the first code point.
+ *
+ * Expands to an if / else-if chain, so it may directly follow a bare `else`. */
+#define PHP_FGETCSV_TRUNCATE(field) \
+if (argc > 4) { \
+ /* Caller knows about new semantics since they're using new param, allow multichar */ \
+} else if (field##_type == IS_STRING && field##_len > 1) { \
+ php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
+ /* Truncate the argument being checked, not unconditionally the delimiter */ \
+ field##_len = 1; \
+} else if (field##_type == IS_UNICODE && u_countChar32((UChar*)field, field##_len) > 1) { \
+ int __tmp = 0; \
+ php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
+ /* Step over exactly one code point; __tmp ends up as its length in UChars */ \
+ U16_FWD_1(((UChar*)field), __tmp, field##_len); \
+ field##_len = __tmp; \
+}
+
PHP_FUNCTION(fgetcsv)
{
- char delimiter = ','; /* allow this to be set as parameter */
- char enclosure = '"'; /* allow this to be set as parameter */
- /* first section exactly as php_fgetss */
-
- long len = 0;
- size_t buf_len;
- char *buf;
+ zend_uchar delimiter_type = IS_STRING, enclosure_type = IS_STRING,
escape_type = IS_STRING;
+ char *delimiter = ",", *enclosure = "\"", *escape = "\\";
+ int delimiter_len = 1, enclosure_len = 1, escape_len = 1;
+ long len = -1;
+ zstr buf;
+ int buf_len, argc = ZEND_NUM_ARGS();
php_stream *stream;
+ zval *zstream;
+ zend_uchar delimiter_free = 0, enclosure_free = 0, escape_free = 0;
- {
- zval *fd, **len_zv = NULL;
- char *delimiter_str = NULL;
- int delimiter_str_len = 0;
- char *enclosure_str = NULL;
- int enclosure_str_len = 0;
-
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r|Zss",
- &fd, &len_zv, &delimiter_str,
&delimiter_str_len,
- &enclosure_str, &enclosure_str_len) ==
FAILURE) {
- return;
- }
+ if (zend_parse_parameters(argc TSRMLS_CC, "r|l!ttt", &zstream, &len,
+ &delimiter, &delimiter_len,
&delimiter_type,
+ &enclosure, &enclosure_len,
&enclosure_type,
+ &escape, &escape_len,
&escape_type) == FAILURE) {
+ return;
+ }
- if (delimiter_str != NULL) {
- /* Make sure that there is at least one character in
string */
- if (delimiter_str_len < 1) {
- php_error_docref(NULL TSRMLS_CC, E_WARNING,
"delimiter must be a character");
- RETURN_FALSE;
- } else if (delimiter_str_len > 1) {
- php_error_docref(NULL TSRMLS_CC, E_NOTICE,
"delimiter must be a single character");
- }
+ PHP_STREAM_TO_ZVAL(stream, &zstream);
+
+ /* Make sure that there is at least one character in string,
+ * For userspace BC purposes we generally limit delimiters and
enclosures to 1 character,
+ * though the code now supports multiple characters
+ *
+ * If this function is called with all five parameters however,
+ * then multiple characters are allowed for all subarguments */
+ if (delimiter_len < 1) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "delimiter must be
a character");
+ RETURN_FALSE;
+ } else PHP_FGETCSV_TRUNCATE(delimiter);
+
+ if (enclosure_len < 1) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "enclosure must be
a character");
+ RETURN_FALSE;
+ } else PHP_FGETCSV_TRUNCATE(enclosure);
+
+ if (escape_len < 1) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "escape must be a
character");
+ RETURN_FALSE;
+ }
+
+ if (len < -1) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "Length parameter
may not be negative");
+ RETURN_FALSE;
+ } else if (len == 0) {
+ len = -1;
+ }
- /* use first character from string */
- delimiter = delimiter_str[0];
+ if (stream->readbuf_type == IS_STRING) {
+ /* Binary mode stream needs binary delimiter/enclosure */
+ if (delimiter_type == IS_UNICODE) {
+ if (FAILURE ==
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &delimiter,
&delimiter_len, (UChar*)delimiter, delimiter_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting delimiter from unicode");
+ RETVAL_FALSE;
+ goto cleanup;
+ }
+ delimiter_free = 1;
+ }
+ if (enclosure_type == IS_UNICODE) {
+ if (FAILURE ==
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &enclosure,
&enclosure_len, (UChar*)enclosure, enclosure_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting enclosure from unicode");
+ RETVAL_FALSE;
+ goto cleanup;
+ }
+ enclosure_free = 1;
+ }
+ if (escape_type == IS_UNICODE) {
+ if (FAILURE ==
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &escape,
&escape_len, (UChar*)escape, escape_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting escape from unicode");
+ RETVAL_FALSE;
+ goto cleanup;
+ }
+ escape_free = 1;
}
-
- if (enclosure_str != NULL) {
- if (enclosure_str_len < 1) {
- php_error_docref(NULL TSRMLS_CC, E_WARNING,
"enclosure must be a character");
- RETURN_FALSE;
- } else if (enclosure_str_len > 1) {
- php_error_docref(NULL TSRMLS_CC, E_NOTICE,
"enclosure must be a single character");
+ } else {
+ /* Unicode mode stream needs unicode delimiter/enclosure */
+ if (delimiter_type == IS_STRING) {
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&delimiter, &delimiter_len, delimiter, delimiter_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting delimiter to unicode");
+ RETVAL_FALSE;
+ goto cleanup;
}
-
- /* use first character from string */
- enclosure = enclosure_str[0];
+ delimiter_free = 1;
}
-
- if (len_zv != NULL && Z_TYPE_PP(len_zv) != IS_NULL) {
- convert_to_long_ex(len_zv);
- len = Z_LVAL_PP(len_zv);
- if (len < 0) {
- php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Length parameter may not be negative");
- RETURN_FALSE;
- } else if (len == 0) {
- len = -1;
+ if (enclosure_type == IS_STRING) {
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&enclosure, &enclosure_len, enclosure, enclosure_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting enclosure to unicode");
+ RETVAL_FALSE;
+ goto cleanup;
}
- } else {
- len = -1;
+ enclosure_free = 1;
+ }
+ if (escape_type == IS_STRING) {
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&escape, &escape_len, escape, escape_len TSRMLS_CC)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING,
"Failed converting escape to unicode");
+ RETVAL_FALSE;
+ goto cleanup;
+ }
+ escape_free = 1;
}
+ }
- PHP_STREAM_TO_ZVAL(stream, &fd);
+ buf.v = php_stream_get_line_ex(stream, stream->readbuf_type, NULL_ZSTR,
0, len, &buf_len);
+ if (!buf.v) {
+ /* No data */
+ RETVAL_FALSE;
+ goto cleanup;
}
- if (len < 0) {
- if ((buf = php_stream_get_line(stream, NULL_ZSTR, 0, &buf_len))
== NULL) {
- RETURN_FALSE;
- }
+ if (stream->readbuf_type == IS_UNICODE) {
+ /* Unicode mode */
+ php_u_fgetcsv(stream, (UChar*)delimiter, delimiter_len,
(UChar*)enclosure, enclosure_len, (UChar*)escape, escape_len, buf.u, buf_len,
return_value TSRMLS_CC);
} else {
- buf = emalloc(len + 1);
- if (php_stream_get_line(stream, ZSTR(buf), len + 1, &buf_len)
== NULL) {
- efree(buf);
- RETURN_FALSE;
- }
+ /* Binary mode */
+ php_fgetcsv_ex(stream, delimiter, delimiter_len, enclosure,
enclosure_len, escape, escape_len, buf.s, buf_len, return_value TSRMLS_CC);
}
- php_fgetcsv(stream, delimiter, enclosure, buf_len, buf, return_value
TSRMLS_CC);
+cleanup:
+ if (delimiter_free) {
+ efree(delimiter);
+ }
+ if (enclosure_free) {
+ efree(enclosure);
+ }
+ if (escape_free) {
+ efree(escape);
+ }
}
/* }}} */
@@ -2161,266 +2186,442 @@
size_t buf_len, char *buf,
zval *return_value TSRMLS_DC)
{
- char *temp, *tptr, *bptr, *line_end, *limit;
- const char escape_char = '\\';
+ char *delim = &delimiter, *enc = &enclosure, *buffer = buf;
+ int delim_len = 1, enc_len = 1, buffer_len = buf_len;
+ zend_uchar type = IS_STRING;
- size_t temp_len, line_end_len;
- int inc_len;
+ if (stream) {
+ type = stream->readbuf_type;
+ }
- /* initialize internal state */
- php_mblen(NULL, 0);
+ if (type == IS_UNICODE) {
+ UChar esc = '\\';
- /* Now into new section that parses buf for delimiter/enclosure fields
*/
+ /* Unicode stream, but binary delimiter/enclosures/prefetch,
promote to unicode */
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&delim, &delim_len, &delimiter, 1 TSRMLS_CC)) {
+ INIT_PZVAL(return_value);
+ return;
+ }
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&enc, &enc_len, &enclosure, 1 TSRMLS_CC)) {
+ efree(delim);
+ INIT_PZVAL(return_value);
+ return;
+ }
+ if (FAILURE ==
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)),
(UChar**)&buffer, &buffer_len, buf, buf_len TSRMLS_CC)) {
+ efree(delim);
+ efree(enc);
+ INIT_PZVAL(return_value);
+ return;
+ }
- /* Strip trailing space from buf, saving end of line in case required
for enclosure field */
+ php_u_fgetcsv(stream, (UChar*)delim, delim_len, (UChar*)enc,
enc_len, &esc, 1,
+ (UChar*)buffer, buffer_len, return_value
TSRMLS_CC);
- bptr = buf;
- tptr = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len,
delimiter TSRMLS_CC);
- line_end_len = buf_len - (size_t)(tptr - buf);
- line_end = limit = tptr;
+ /* Types converted, free storage */
+ efree(delim);
+ efree(enc);
+ efree(buffer);
+ } else {
+ /* Binary stream with binary delimiter/enclosures/prefetch */
+ php_fgetcsv_ex(stream, delim, delim_len, enc, enc_len, "\\", 1,
buffer, buffer_len, return_value TSRMLS_CC);
+ }
+}
- /* reserve workspace for building each individual field */
- temp_len = buf_len;
- temp = emalloc(temp_len + line_end_len + 1);
+typedef enum _php_fgetcsv_state {
+ PHP_FGETCSV_READY,
+ PHP_FGETCSV_FIELD_NO_ENC,
+ PHP_FGETCSV_FIELD_WITH_ENC,
+ PHP_FGETCSV_POST_ENC,
+} php_fgetcsv_state;
+
+#define PHP_FGETCSV_BIN_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 &&
*(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m),
(mlen)) == 0)))
+
+/* Binary mode fgetcsv */
+PHPAPI void php_fgetcsv_ex(php_stream *stream,
+ char *delimiter, int delimiter_len,
+ char *enclosure, int enclosure_len,
+ char *escape, int escape_len,
+ char *buffer, int buffer_len,
+ zval *return_value TSRMLS_DC)
+{
+ php_fgetcsv_state state = PHP_FGETCSV_READY;
+ char *p = buffer, *e = buffer + buffer_len, *field_start = NULL,
*field_end = NULL;
- /* Initialize return array */
array_init(return_value);
- /* Main loop to read CSV fields */
- /* NB this routine will return a single null entry for a blank line */
-
- do {
- char *comp_end, *hunk_begin;
+ while(p < e) {
+ switch (state) {
+ case PHP_FGETCSV_READY:
+ready_state:
+ /* Ready to start a new field */
+
+ /* Is there nothing left to scan? */
+ if (*p == '\r' || *p == '\n') {
+ /* Terminal delimiter, treat as empty
field */
+ p++;
+ add_next_index_stringl(return_value,
"", 0, 1);
+ break;
+ }
- tptr = temp;
+ /* Is it enclosed? */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure,
enclosure_len)) {
+ /* Enclosure encountered, switch state
*/
+ state = PHP_FGETCSV_FIELD_WITH_ENC;
+ p += enclosure_len;
+ field_start = p;
+ break;
+ }
- /* 1. Strip any leading space */
- for (;;) {
- inc_len = (bptr < limit ? (*bptr == '\0' ? 1:
php_mblen(bptr, limit - bptr)): 0);
- switch (inc_len) {
- case -2:
- case -1:
- inc_len = 1;
- php_mblen(NULL, 0);
+ /* Is it an immediate delimiter? */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, delimiter,
delimiter_len)) {
+ /* Immediate delimiter, treat as empty
field */
+ p += delimiter_len;
+ add_next_index_stringl(return_value,
"", 0, 1);
break;
- case 0:
- goto quit_loop_1;
- case 1:
- if (!isspace((int)*(unsigned char
*)bptr) || *bptr == delimiter) {
- goto quit_loop_1;
+ }
+
+ /* Whitespace? */
+ if (*p == ' ' || *p == '\t') {
+ p++;
+ if (p >= e) break;
+ goto ready_state;
+ }
+
+ /* Is it an escape character? */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, escape,
escape_len)) {
+ /* Skip escape sequence and let next
char be treated as literal */
+ p += escape_len;
+ /* FALL THROUGH */
+ }
+
+ /* Otherwise, starting a new field without
enclosures */
+ state = PHP_FGETCSV_FIELD_NO_ENC;
+ field_start = p;
+ field_end = NULL;
+ p++;
+ break;
+
+ case PHP_FGETCSV_FIELD_WITH_ENC:
+with_enc:
+ /* Check for ending enclosure */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure,
enclosure_len)) {
+ /* Enclosure encountered, is it paired?
*/
+ if (PHP_FGETCSV_BIN_CHECK(p +
enclosure_len, e, enclosure, enclosure_len)) {
+ /* Double enclosure gets
translated to single enclosure */
+ memmove(p, p + enclosure_len,
(e - p) - enclosure_len);
+ e -= enclosure_len;
+ p += enclosure_len;
+ goto with_enc;
+ } else {
+ /* Genuine end enclosure,
switch state */
+ field_end = p;
+ p += enclosure_len;
+ state = PHP_FGETCSV_POST_ENC;
+ goto post_enc;
}
+ }
+
+ /* Check for field escapes */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, escape,
escape_len)) {
+ p += escape_len + 1;
+
+ /* Reprocess for ending enclosures */
+ goto with_enc;
+ }
+
+ /* Simple character */
+ if (e - p) {
+ p++;
+ }
+
+ /* Hungry? */
+ if (((e - p) < enclosure_len) && stream) {
+ /* Feed me! */
+ int new_len;
+ char *new_buf =
php_stream_get_line(stream, NULL_ZSTR, 0, &new_len);
+
+ if (new_buf) {
+ int tmp_len = new_len + e -
field_start;
+ char *tmp = emalloc(tmp_len);
+
+ /* Realign scan buffer */
+ memcpy(tmp, field_start, e -
field_start);
+ memcpy(tmp + (e - field_start),
new_buf, new_len);
+ field_start = tmp;
+ if (field_end) {
+ field_end = tmp +
(field_end - field_start);
+ }
+ efree(buffer);
+ efree(new_buf);
+ buffer = tmp;
+ buffer_len = tmp_len;
+ p = buffer;
+ e = buffer + buffer_len;
+ }
+ }
+
+ if ((e - p) == 0) {
+ /* Nothing left to consume the buffer,
use it */
+ add_next_index_stringl(return_value,
field_start, p - field_start, 1);
+
+ /* Loop is dying anyway, but be
pedantic */
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
break;
- default:
- goto quit_loop_1;
- }
- bptr += inc_len;
+ }
+ break;
+
+ case PHP_FGETCSV_POST_ENC:
+post_enc:
+ /* Check for delimiters or EOL */
+ if (p >= e || *p == '\r' || *p == '\n' ||
PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
+ int field_len = field_end - field_start;
+ char *field;
+
+ if ((p - enclosure_len) > field_end) {
+ /* There's cruft, append it to
the proper field */
+ int cruft_len = p - (field_end
+ enclosure_len);
+
+ field = emalloc(field_len +
cruft_len + 1);
+ memcpy(field, field_start,
field_len);
+ memcpy(field + field_len,
field_end + enclosure_len, cruft_len);
+
+ field_len += cruft_len;
+ field[field_len] = 0;
+ } else {
+ field = estrndup(field_start,
field_end - field_start);
+ }
+ add_next_index_stringl(return_value,
field, field_len, 0);
+
+ /* Reset scanner */
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
+ p += delimiter_len;
+ goto ready_state;
+ }
+
+ /* Queue anything else as cruft */
+ p++;
+ break;
+
+ case PHP_FGETCSV_FIELD_NO_ENC:
+ /* Check for escapes */
+ if (PHP_FGETCSV_BIN_CHECK(p, e, escape,
escape_len)) {
+ p += escape_len + 1;
+ }
+
+ /* Check for delimiter */
+ if (p >= e || *p == '\r' || *p == '\n' ||
PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
+ add_next_index_stringl(return_value,
field_start, p - field_start, 1);
+
+ /* Reset scanner */
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
+ p += delimiter_len;
+ goto ready_state;
+ }
+
+ /* Simple character */
+ p++;
+ break;
}
+ }
- quit_loop_1:
- /* 2. Read field, leaving bptr pointing at start of next field
*/
- if (inc_len != 0 && *bptr == enclosure) {
- int state = 0;
-
- bptr++; /* move on to first character in field */
- hunk_begin = bptr;
-
- /* 2A. handle enclosure delimited field */
- for (;;) {
- switch (inc_len) {
- case 0:
- switch (state) {
- case 2:
- memcpy(tptr,
hunk_begin, bptr - hunk_begin - 1);
- tptr += (bptr -
hunk_begin - 1);
- hunk_begin =
bptr;
- goto
quit_loop_2;
-
- case 1:
- memcpy(tptr,
hunk_begin, bptr - hunk_begin);
- tptr += (bptr -
hunk_begin);
- hunk_begin =
bptr;
- /* break is
omitted intentionally */
-
- case 0: {
- char *new_buf;
- size_t new_len;
- char *new_temp;
-
- memcpy(tptr,
hunk_begin, bptr - hunk_begin);
- tptr += (bptr -
hunk_begin);
- hunk_begin =
bptr;
- if (hunk_begin
!= line_end) {
-
memcpy(tptr, hunk_begin, bptr - hunk_begin);
- tptr +=
(bptr - hunk_begin);
-
hunk_begin = bptr;
- }
-
- /* add the
embedded line end to the field */
- memcpy(tptr,
line_end, line_end_len);
- tptr +=
line_end_len;
-
- if ((new_buf =
php_stream_get_line(stream, NULL_ZSTR, 0, &new_len)) == NULL) {
- /*
we've got an unterminated enclosure,
- *
assign all the data from the start of
- * the
enclosure to end of data to the
- * last
element */
- if
((size_t)temp_len > (size_t)(limit - buf)) {
-
goto quit_loop_2;
- }
-
zval_dtor(return_value);
-
RETVAL_FALSE;
- goto
out;
- }
- temp_len +=
new_len;
- new_temp =
erealloc(temp, temp_len);
- tptr = new_temp
+ (size_t)(tptr - temp);
- temp = new_temp;
-
- efree(buf);
- buf_len =
new_len;
- bptr = buf =
new_buf;
- hunk_begin =
buf;
+ efree(buffer);
+}
+/* }}} */
- line_end =
limit = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter
TSRMLS_CC);
- line_end_len =
buf_len - (size_t)(limit - buf);
+#define PHP_FGETCSV_UNI_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 &&
*(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m),
UBYTES(mlen)) == 0)))
- state = 0;
- } break;
- }
- break;
+/* Unicode mode fgetcsv */
+PHPAPI void php_u_fgetcsv(php_stream *stream,
+ UChar *delimiter, int delimiter_len,
+ UChar *enclosure, int enclosure_len,
+ UChar *escape, int escape_len,
+ UChar *buffer, int buffer_len,
+ zval *return_value TSRMLS_DC)
+{
+ php_fgetcsv_state state = PHP_FGETCSV_READY;
+ UChar *p = buffer, *e = buffer + buffer_len, *field_start = NULL,
*field_end = NULL;
- case -2:
- case -1:
- php_mblen(NULL, 0);
- /* break is omitted
intentionally */
- case 1:
- /* we need to determine if the
enclosure is
- * 'real' or is it escaped */
- switch (state) {
- case 1: /* escaped */
- bptr++;
- state = 0;
- break;
- case 2: /* embedded
enclosure ? let's check it */
- if (*bptr !=
enclosure) {
- /* real
enclosure */
-
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
- tptr +=
(bptr - hunk_begin - 1);
-
hunk_begin = bptr;
- goto
quit_loop_2;
- }
- memcpy(tptr,
hunk_begin, bptr - hunk_begin);
- tptr += (bptr -
hunk_begin);
- bptr++;
- hunk_begin =
bptr;
- state = 0;
- break;
- default:
- if (*bptr ==
escape_char) {
- state =
1;
- } else if
(*bptr == enclosure) {
- state =
2;
- }
- bptr++;
- break;
- }
- break;
+ array_init(return_value);
- default:
- switch (state) {
- case 2:
- /* real
enclosure */
- memcpy(tptr,
hunk_begin, bptr - hunk_begin - 1);
- tptr += (bptr -
hunk_begin - 1);
- hunk_begin =
bptr;
- goto
quit_loop_2;
- case 1:
- bptr += inc_len;
- memcpy(tptr,
hunk_begin, bptr - hunk_begin);
- tptr += (bptr -
hunk_begin);
- hunk_begin =
bptr;
- break;
- default:
- bptr += inc_len;
- break;
- }
- break;
+ while(p < e) {
+ switch (state) {
+ case PHP_FGETCSV_READY:
+ready_state:
+ /* Ready to start a new field */
+
+ /* Is there nothing left to scan? */
+ if (*p == '\r' || *p == '\n') {
+ /* Terminal delimiter, treat as empty
field */
+ p++;
+ add_next_index_stringl(return_value,
"", 0, 1);
+ break;
}
- inc_len = (bptr < limit ? (*bptr == '\0' ? 1:
php_mblen(bptr, limit - bptr)): 0);
- }
- quit_loop_2:
- /* look up for a delimiter */
- for (;;) {
- switch (inc_len) {
- case 0:
- goto quit_loop_3;
-
- case -2:
- case -1:
- inc_len = 1;
- php_mblen(NULL, 0);
- /* break is omitted
intentionally */
- case 1:
- if (*bptr == delimiter) {
- goto quit_loop_3;
- }
- break;
- default:
- break;
+ /* Is it enclosed? */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure,
enclosure_len)) {
+ /* Enclosure encountered, switch state
*/
+ state = PHP_FGETCSV_FIELD_WITH_ENC;
+ p += enclosure_len;
+ field_start = p;
+ break;
}
- bptr += inc_len;
- inc_len = (bptr < limit ? (*bptr == '\0' ? 1:
php_mblen(bptr, limit - bptr)): 0);
- }
- quit_loop_3:
- memcpy(tptr, hunk_begin, bptr - hunk_begin);
- tptr += (bptr - hunk_begin);
- bptr += inc_len;
- comp_end = tptr;
- } else {
- /* 2B. Handle non-enclosure field */
+ /* Is it an immediate delimiter? */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, delimiter,
delimiter_len)) {
+ /* Immediate delimiter, treat as empty
field */
+ p += delimiter_len;
+ add_next_index_unicodel(return_value,
(UChar*)"", 0, 1);
+ break;
+ }
+
+ /* Whitespace? */
+ if (*p == ' ' || *p == '\t') {
+ p++;
+ if (p >= e) break;
+ goto ready_state;
+ }
+
+ /* Is it an escape character? */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, escape,
escape_len)) {
+ /* Skip escape sequence and let next
char be treated as literal */
+ p += escape_len;
+ /* FALL THROUGH */
+ }
+
+ /* Otherwise, starting a new field without
enclosures */
+ state = PHP_FGETCSV_FIELD_NO_ENC;
+ field_start = p;
+ field_end = NULL;
+ p++;
+ break;
+
+ case PHP_FGETCSV_FIELD_WITH_ENC:
+with_enc:
+ /* Check for ending enclosure */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure,
enclosure_len)) {
+ /* Enclosure encountered, is it paired?
*/
+ if (PHP_FGETCSV_UNI_CHECK(p +
enclosure_len, e, enclosure, enclosure_len)) {
+ /* Double enclosure gets
translated to single enclosure */
+ memmove(p, p + enclosure_len,
(e - p) - enclosure_len);
+ e -= enclosure_len;
+ p += enclosure_len;
+ goto with_enc;
+ } else {
+ /* Genuine end enclosure,
switch state */
+ field_end = p;
+ p += enclosure_len;
+ state = PHP_FGETCSV_POST_ENC;
+ goto post_enc;
+ }
+ }
+
+ /* Check for field escapes */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, escape,
escape_len)) {
+ p += escape_len + 1;
- hunk_begin = bptr;
+ /* Reprocess for ending enclosures */
+ goto with_enc;
+ }
- for (;;) {
- switch (inc_len) {
- case 0:
- goto quit_loop_4;
- case -2:
- case -1:
- inc_len = 1;
- php_mblen(NULL, 0);
- /* break is omitted
intentionally */
- case 1:
- if (*bptr == delimiter) {
- goto quit_loop_4;
+ /* Simple character */
+ if (e - p) {
+ p++;
+ }
+
+ /* Hungry? */
+ if (((e - p) < enclosure_len) && stream) {
+ /* Feed me! */
+ int new_len;
+ UChar *new_buf =
(UChar*)php_stream_get_line_ex(stream, IS_UNICODE, NULL_ZSTR, 0, 0, &new_len);
+
+ if (new_buf) {
+ int tmp_len = new_len + e -
field_start;
+ UChar *tmp = eumalloc(tmp_len);
+
+ /* Realign scan buffer, ick --
expensive */
+ memcpy(tmp, field_start,
UBYTES(e - field_start));
+ memcpy(tmp + (e - field_start),
new_buf, UBYTES(new_len));
+ field_start = tmp;
+ if (field_end) {
+ field_end = tmp +
(field_end - field_start);
}
- break;
- default:
- break;
+ efree(buffer);
+ efree(new_buf);
+ buffer = tmp;
+ buffer_len = tmp_len;
+ p = buffer;
+ e = buffer + buffer_len;
+ }
}
- bptr += inc_len;
- inc_len = (bptr < limit ? (*bptr == '\0' ? 1:
php_mblen(bptr, limit - bptr)): 0);
- }
- quit_loop_4:
- memcpy(tptr, hunk_begin, bptr - hunk_begin);
- tptr += (bptr - hunk_begin);
-
- comp_end = (char
*)php_fgetcsv_lookup_trailing_spaces(temp, tptr - temp, delimiter TSRMLS_CC);
- if (*bptr == delimiter) {
- bptr++;
- }
- }
- /* 3. Now pass our field back to php */
- *comp_end = '\0';
- add_next_index_stringl(return_value, temp, comp_end - temp, 1);
- } while (inc_len > 0);
+ if ((e - p) == 0) {
+ /* Nothing left to consume the buffer */
+ add_next_index_unicodel(return_value,
field_start, p - field_start, 1);
+
+ /* Loop is dying, but cleanup anyway */
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
+ break;
+ }
+ break;
-out:
- efree(temp);
- efree(buf);
+ case PHP_FGETCSV_POST_ENC:
+post_enc:
+ /* Check for delimiters or EOL */
+ if (p >= e || *p == '\r' || *p == '\n' ||
PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
+ int field_len = field_end - field_start;
+ UChar *field;
+
+ if ((p - enclosure_len) > field_end) {
+ /* There's cruft, append it to
the regular field */
+ int cruft_len = p - (field_end
+ enclosure_len);
+
+ field = eumalloc(field_len +
cruft_len + 1);
+ memcpy(field, field_start,
field_len);
+ memcpy(field + field_len,
field_end + enclosure_len, UBYTES(cruft_len));
+ field_len += cruft_len;
+ field[field_len] = 0;
+ } else {
+ field = eustrndup(field_start,
field_len);
+ }
+ add_next_index_unicodel(return_value,
field, field_len, 0);
+
+ /* Reset scanner state */
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
+ p += delimiter_len;
+ goto ready_state;
+ }
+
+ /* Queue anything else as cruft */
+ p++;
+ break;
+
+ case PHP_FGETCSV_FIELD_NO_ENC:
+ /* Check for escapes */
+ if (PHP_FGETCSV_UNI_CHECK(p, e, escape,
escape_len)) {
+ p += escape_len + 1;
+ }
+
+ /* Check for delimiter */
+ if (p >= e || *p == '\r' || *p == '\n' ||
PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
+ add_next_index_unicodel(return_value,
field_start, p - field_start, 1);
+ state = PHP_FGETCSV_READY;
+ field_start = field_end = NULL;
+ p += delimiter_len;
+ goto ready_state;
+ }
+
+ /* Simple character */
+ p++;
+ break;
+ }
+ }
+
+ efree(buffer);
}
/* }}} */
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/file.h?r1=1.101&r2=1.102&diff_format=u
Index: php-src/ext/standard/file.h
diff -u php-src/ext/standard/file.h:1.101 php-src/ext/standard/file.h:1.102
--- php-src/ext/standard/file.h:1.101 Fri Oct 13 09:55:48 2006
+++ php-src/ext/standard/file.h Tue Dec 5 04:13:46 2006
@@ -16,7 +16,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: file.h,v 1.101 2006/10/13 09:55:48 bjori Exp $ */
+/* $Id: file.h,v 1.102 2006/12/05 04:13:46 pollita Exp $ */
/* Synced with php 3.0 revision 1.30 1999-06-16 [ssb] */
@@ -77,6 +77,11 @@
PHPAPI int php_mkdir_ex(char *dir, long mode, int options TSRMLS_DC);
PHPAPI int php_mkdir(char *dir, long mode TSRMLS_DC);
PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure,
size_t buf_len, char *buf, zval *return_value TSRMLS_DC);
+PHPAPI void php_fgetcsv_ex(php_stream *stream, char *delimiter, int
delimiter_len, char *enclosure, int enclosure_len, char *escape, int escape_len,
+ char *buffer, int buffer_len, zval *return_value TSRMLS_DC);
+PHPAPI void php_u_fgetcsv(php_stream *stream, UChar *delimiter, int
delimiter_len, UChar *enclosure, int enclosure_len, UChar *escape, int
escape_len,
+ UChar *buffer, int buffer_len, zval *return_value TSRMLS_DC);
+
#define META_DEF_BUFSIZE 8192
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/tests/file/bug12556.phpt?r1=1.5&r2=1.6&diff_format=u
Index: php-src/ext/standard/tests/file/bug12556.phpt
diff -u php-src/ext/standard/tests/file/bug12556.phpt:1.5
php-src/ext/standard/tests/file/bug12556.phpt:1.6
--- php-src/ext/standard/tests/file/bug12556.phpt:1.5 Wed May 19 08:54:51 2004
+++ php-src/ext/standard/tests/file/bug12556.phpt Tue Dec 5 04:13:47 2006
@@ -46,3 +46,41 @@
2,4,5,line3
"
}
+--UEXPECT--
+array(4) {
+ [0]=>
+ unicode(1) "6"
+ [1]=>
+ unicode(1) "7"
+ [2]=>
+ unicode(1) "8"
+ [3]=>
+ unicode(5) "line1"
+}
+array(4) {
+ [0]=>
+ unicode(1) "1"
+ [1]=>
+ unicode(1) "2"
+ [2]=>
+ unicode(1) "3"
+ [3]=>
+ unicode(186) "line2
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+"
+}
+
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/tests/file/fgetcsv.phpt?r1=1.1&r2=1.2&diff_format=u
Index: php-src/ext/standard/tests/file/fgetcsv.phpt
diff -u php-src/ext/standard/tests/file/fgetcsv.phpt:1.1
php-src/ext/standard/tests/file/fgetcsv.phpt:1.2
--- php-src/ext/standard/tests/file/fgetcsv.phpt:1.1 Mon Jan 19 03:55:29 2004
+++ php-src/ext/standard/tests/file/fgetcsv.phpt Tue Dec 5 04:13:47 2006
@@ -28,7 +28,7 @@
$file = dirname(__FILE__) . 'fgetcsv.csv';
@unlink($file);
foreach ($list as $v) {
- $fp = fopen($file, "w");
+ $fp = fopen($file, "wt");
fwrite($fp, $v . "\n");
fclose($fp);
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php