Hi:
Enclosed is a patch to allow PCRE's preg_split to return an array of
(match, offset) pairs, if PREG_SPLIT_OFFSET_CAPTURE is or'd into the
flags parameter. Submitted for inclusion, rejection, extensive flaming,
or suggestions. :)
This is a re-send of a previous patch; the last one didn't seem to make
it to the list.
A bit of background:
I'm currently working on a cross-referencing system that uses character
offsets internally, matching entries in a word index to positions in a
file. The system captures its word list via preg_split, excluding
certain tags, character combinations, and whitespace from indexing.
Not finding an obvious way to capture the match offsets directly, I
tried:
+ Rescanning the input string with strstr(), starting from
position(last_match) + 1, looking for the current match. While
reasonably fast at O(n), it has a major problem when the matched
string is also part of a delimiter.
+ A somewhat involved sequence of two preg_split() calls and an
array_diff(). One split is PREG_SPLIT_DELIM_CAPTURE, and the
array_diff "finds" which strings are delimiters. The resulting array
is then scanned, keeping a running total of string lengths. This
works, but has an obviously large memory (and, to a lesser extent,
run-time) cost.
Alternatives (especially other plain PHP solutions) are welcome.
Otherwise - is there more than a snowball's chance of something like
this being included in a future release?
Thanks in advance,
- Dave
[EMAIL PROTECTED]
--- php-4.2.1-dist/ext/pcre/php_pcre.c Thu Feb 28 03:26:35 2002
+++ php-4.2.1/ext/pcre/php_pcre.c Fri May 17 11:28:02 2002
@@ -37,6 +37,7 @@
#define PREG_SPLIT_NO_EMPTY (1<<0)
#define PREG_SPLIT_DELIM_CAPTURE (1<<1)
+#define PREG_SPLIT_OFFSET_CAPTURE (1<<2)
#define PREG_REPLACE_EVAL (1<<0)
@@ -100,6 +101,7 @@
REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE,
CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE,
+CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS |
CONST_PERSISTENT);
return SUCCESS;
}
@@ -1080,8 +1082,10 @@
int limit_val = -1; /* Integer value of
limit */
int no_empty = 0; /* If NO_EMPTY flag is
set */
int delim_capture = 0; /* If delimiters should be
captured */
+ int offset_capture = 0;/* If offsets should be
+captured */
int count = 0; /* Count of
matched subpatterns */
int start_offset; /* Where the new
search starts */
+ int next_offset; /* End of the last
+delimiter match + 1 */
int g_notempty = 0; /* If the match should
not be empty */
char *match, /* The current match */
*last_match; /* Location of last
match */
@@ -1102,6 +1106,7 @@
convert_to_long_ex(flags);
no_empty = Z_LVAL_PP(flags) & PREG_SPLIT_NO_EMPTY;
delim_capture = Z_LVAL_PP(flags) & PREG_SPLIT_DELIM_CAPTURE;
+ offset_capture = Z_LVAL_PP(flags) & PREG_SPLIT_OFFSET_CAPTURE;
}
}
@@ -1123,6 +1128,7 @@
/* Start at the beginning of the string */
start_offset = 0;
+ next_offset = 0;
last_match = Z_STRVAL_PP(subject);
match = NULL;
@@ -1143,9 +1149,27 @@
match = Z_STRVAL_PP(subject) + offsets[0];
if (!no_empty || &Z_STRVAL_PP(subject)[offsets[0]] !=
last_match) {
- /* Add the piece to the return value */
- add_next_index_stringl(return_value, last_match,
-
&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+
+ if (offset_capture) {
+ zval *match_pair;
+ ALLOC_ZVAL(match_pair);
+ array_init(match_pair);
+ INIT_PZVAL(match_pair);
+
+ /* Add (match, offset) to the return value */
+ add_next_index_stringl(match_pair, last_match,
+
+&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+
+ add_next_index_long(match_pair, next_offset);
+
+
+zend_hash_next_index_insert(Z_ARRVAL_P(return_value), &match_pair,
+
+ sizeof(zval *), NULL);
+
+ } else {
+ /* Add the piece to the return value */
+ add_next_index_stringl(return_value,
+last_match,
+
+&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+ }
/* One less left to do */
if (limit_val != -1)
@@ -1153,6 +1177,7 @@
}
last_match = &Z_STRVAL_PP(subject)[offsets[1]];
+ next_offset = offsets[1];
if (delim_capture) {
int i, match_len;
@@ -1185,11 +1210,32 @@
/* Advance to the position right after the last full match */
start_offset = offsets[1];
}
-
+
+
if (!no_empty || start_offset != Z_STRLEN_PP(subject))
- /* Add the last piece to the return value */
- add_next_index_string(return_value,
-
&Z_STRVAL_PP(subject)[start_offset], 1);
+ {
+ if (offset_capture) {
+ zval *match_pair;
+ ALLOC_ZVAL(match_pair);
+ array_init(match_pair);
+ INIT_PZVAL(match_pair);
+
+ /* Add the last (match, offset) pair to the return value */
+ add_next_index_string(match_pair,
+
+&Z_STRVAL_PP(subject)[start_offset], 1);
+
+ add_next_index_long(match_pair, start_offset);
+
+ zend_hash_next_index_insert(Z_ARRVAL_P(return_value),
+&match_pair,
+
+sizeof(zval *), NULL);
+
+ } else {
+ /* Add the last piece to the return value */
+ add_next_index_string(return_value,
+
+&Z_STRVAL_PP(subject)[start_offset], 1);
+ }
+ }
+
/* Clean up */
efree(offsets);
----- End forwarded message -----
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php