I made the changes needed to use UTF-32 instead. It turned out that the PCRE version 1 API I was using does not properly support UTF-32 patterns (only match data). Thus, I changed the code to use version 2 instead.
I have attached the two files that I changed. It works, as can be seen in the below example, but it's nowhere near complete. * '(..)..(..)⍱$' ⎕RE "footesting⌽⍱"* ┏→━━━━━━━━┓ ┃"st" "g⌽"┃ ┗∊━━━━━━━━┛ Now, there are two changes I would like to see: - If the right-hand argument is an array of strings, the pattern should be applied to all strings, collecting the results into a 2D array. This will be quite efficient, since the pattern only needs to be compiled once. - I'd like an axis-argument with options. One of those options should be a flag that causes a mismatch to yield an error instead of ⍬. This would be useful when the regex check is used to extract data out of data which is expected to follow a given pattern (think one-liners in interactive mode). The reason I haven't implemented these myself is because I find the current code to be absolutely awful, especially with all the duplicated code to deallocate PCRE structures. In Lisp I'd use an UNWIND-PROTECT (or try/finally in Java), but in C++ I think I have to declare a new class with a destructor to handle this, correct? Is there anyone who would like to clean this up? Regards, Elias On 21 September 2017 at 19:39, Juergen Sauermann < [email protected]> wrote: > Hi Elias, > > the UTF8_constructors look OK, but it can be tricky to properly interpret > indices (the elements of sub in your code) of > UTF8-encoded strings (i.e whether they mean code points or byte offsets). > > My feeling is that you should avoid UTF8_strings completely and go for the > UTF32 option of the library (assuming that > UTF32 are codepoints encoded as 32 bit integers). APL character strings > are almost UTF32 strings (except for gaps between > the codepoints) and they avoid all the bits shifting needed for UTF8 > strings. > > Best Regards, > /// Jürgen > > > On 09/21/2017 12:09 PM, Elias Mårtenson wrote: > > I've implemented the bare minimal needed to get regexes working through > a ⎕RE function. I've attached the diff. 
> > I really need Jürgen to take a look at this, since my code that constructs > the return value cannot possibly be correct. There must be a better way to > handle this which does not involve conversion back and forth between > std::string. > > Also, I have the result in an UTF-8-encoded C string, and I try to create > an UTF8_string from it like this: > > Value_P field_value(UTF8_string(field.c_str()), LOC); > > However, when I test this in APL I get the following result: > > '(..)..(..)$' ⎕RE 'sdklfjfj⍉' > ┏→━━━━━━━━━━┓ > ┃"lf" "jâ\215\211"┃ > ┗∊━━━━━━━━━━┛ > > It seems the UTF-8 conversion is not done correctly by the UTF8_string > constructor. What did I do wrong? > > Regards, > Elias > > On 21 September 2017 at 11:38, Xiao-Yong Jin <[email protected]> > wrote: > >> >> > On Sep 20, 2017, at 9:19 PM, Peter Teeson <[email protected]> >> wrote: >> > >> > (These days performance can hardly be a compelling argument >> > with multiple many-core CPU chips.) >> >> This kind of argument for APL is exactly why Fortran is still alive and >> well. >> >> > >
#include "Quad_RE.hh"
#include "Workspace.hh"
#include "PointerCell.hh"
#define PCRE2_CODE_UNIT_WIDTH 32
#include <pcre2.h>
// Definition of the static Quad_RE instance declared in the class.
Quad_RE Quad_RE::_fun;
// Static pointer member, initialised to the address of the instance above.
Quad_RE *Quad_RE::fun = &Quad_RE::_fun;
/// Copy the characters of \b string into a newly allocated array of PCRE2
/// 32-bit code units.
///
/// The array is allocated with string.size() elements and no terminating
/// zero is appended, so callers must pass the length separately. Ownership
/// passes to the caller, who has to release the array with delete[].
static const PCRE2_UCHAR32 *ucs_to_codepoints(const UCS_string &string)
{
    const int count = string.size();
    PCRE2_UCHAR32 *const units = new PCRE2_UCHAR32[count];

    int filled = 0;
    for(UCS_string::iterator it = string.begin(); it.more(); ) {
        units[filled++] = it.next();
    }

    return units;
}
/// Build a UCS_string from a zero-terminated array of PCRE2 32-bit code
/// units.
///
/// Every code unit before the first 0 is appended (cast to Unicode); the
/// terminator itself is not part of the result. \b buf must be
/// zero-terminated, since no length is passed.
static UCS_string make_ucs_string(PCRE2_UCHAR32 *buf)
{
    UCS_string text;
    for(const PCRE2_UCHAR32 *cursor = buf; *cursor != 0; ++cursor) {
        text.append(static_cast<Unicode>(*cursor));
    }
    return text;
}
/// Construct the function object; only forwards the TOK_Quad_RE tag to the
/// QuadFunction base class.
Quad_RE::Quad_RE()
    : QuadFunction(TOK_Quad_RE)
{}
Token Quad_RE::eval_AB(Value_P A, Value_P B)
{
if(!A->is_char_string()) {
MORE_ERROR() << "Regexp argument must be a string value";
VALUE_ERROR;
}
UCS_string pattern = A->get_UCS_ravel();
const PCRE2_UCHAR32 *pattern_ucs = ucs_to_codepoints(pattern);
int error_code;
PCRE2_SIZE error_offset;
pcre2_code *code = pcre2_compile_32(pattern_ucs, pattern.size(), PCRE2_NO_UTF_CHECK, &error_code, &error_offset, NULL);
delete[] pattern_ucs;
if(code == NULL) {
PCRE2_UCHAR32 buf[256];
pcre2_get_error_message_32(error_code, buf, sizeof(buf));
UCS_string error_message = make_ucs_string(buf);
MORE_ERROR() << "Error compiling regex at offset: " << error_offset << ": " << error_message;
VALUE_ERROR;
}
const Shape &shape = B->get_shape();
if(shape.get_rank() == 0) {
return Token(TOK_APL_VALUE1, Idx0(LOC));
}
else if(B->is_char_string()) {
UCS_string matched = B->get_UCS_ravel();
const PCRE2_UCHAR32 *matched_ucs = ucs_to_codepoints(matched);
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern_32(code, NULL);
int match_result = pcre2_match_32(code, matched_ucs, matched.size(), 0, 0, match_data, NULL);
if(match_result < 0) {
delete[] matched_ucs;
pcre2_match_data_free(match_data);
pcre2_code_free(code);
return Token(TOK_APL_VALUE1, Idx0(LOC));
}
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(match_data);
if(match_result == 0) {
delete[] matched_ucs;
pcre2_match_data_free(match_data);
pcre2_code_free(code);
MORE_ERROR() << "Match buffer too small";
FIXME;
}
if(match_result == 1) {
UCS_string result(reinterpret_cast<const Unicode *>(matched_ucs + ovector[0]), ovector[1] - ovector[0]);
delete[] matched_ucs;
pcre2_match_data_free(match_data);
pcre2_code_free(code);
Value_P result_value(result, LOC);
result_value->check_value(LOC);
return Token(TOK_APL_VALUE1, result_value);
}
else {
Shape shape(match_result - 1);
Value_P result_value(shape, LOC);
for(int i = 1 ; i < match_result ; i++) {
PCRE2_SIZE start = ovector[i * 2];
PCRE2_SIZE end = ovector[i * 2 + 1];
Value_P field_value(UCS_string(reinterpret_cast<const Unicode *>(matched_ucs + start), end - start), LOC);
field_value->check_value(LOC);
new (result_value->next_ravel()) PointerCell(field_value, result_value.getref());
}
delete[] matched_ucs;
pcre2_match_data_free(match_data);
pcre2_code_free(code);
result_value->check_value(LOC);
return Token(TOK_APL_VALUE1, result_value);
}
}
else {
pcre2_code_free(code);
VALENCE_ERROR;
}
#if 0
const Shape &shape = B->get_shape();
if(shape.get_rank() == 0) {
return Token(TOK_APL_VALUE1, Str0(LOC));
}
else if(B->is_char_string()) {
UCS_string matched = B->get_UCS_ravel();
cout << "will match: '" << matched << "' against '" << pattern << "'" << endl;
int *matched_ucs = ucs_to_codepoints(matched);
int match_result = pcre_exec(code, extra, reinterpret_cast<char *>(matched_ucs), matched.size(),
0, 0, sub, sizeof(sub) / sizeof(sub[0]));
cout << "n = " << match_result << endl;
if(match_result < 0) {
delete[] matched_ucs;
return Token(TOK_APL_VALUE1, Idx0(LOC));
}
if(match_result == 1) {
// No subexpressions, return the entire matched string
UCS_string result(reinterpret_cast<Unicode *>(matched_ucs + sub[0]), sub[1] - sub[0]);
delete[] matched_ucs;
Value_P result_value(result, LOC);
result_value->check_value(LOC);
return Token(TOK_APL_VALUE1, result_value);
}
else {
Shape shape(match_result - 1);
Value_P result_value(shape, LOC);
for(int i = 1 ; i < match_result ; i++) {
size_t start = sub[i * 2];
size_t end = sub[i * 2 + 1];
Value_P field_value(UCS_string(reinterpret_cast<Unicode *>(matched_ucs + start), end - start), LOC);
field_value->check_value(LOC);
new (result_value->next_ravel()) PointerCell(field_value, result_value.getref());
}
delete[] matched_ucs;
result_value->check_value(LOC);
return Token(TOK_APL_VALUE1, result_value);
}
}
else {
VALUE_ERROR;
}
#endif
}
/// Dyadic ⎕RE with axis. Placeholder: all three arguments are ignored and
/// the value produced by Str0() is returned.
Token Quad_RE::eval_AXB(const Value_P A, const Value_P X, const Value_P B)
{
    Value_P placeholder = Str0(LOC);
    return Token(TOK_APL_VALUE1, placeholder);
}
/// Monadic ⎕RE. Placeholder: B is ignored and the value produced by Str0()
/// is returned.
Token Quad_RE::eval_B(Value_P B)
{
    Value_P placeholder = Str0(LOC);
    return Token(TOK_APL_VALUE1, placeholder);
}
/// Monadic ⎕RE with axis. Placeholder: X and B are ignored and the value
/// produced by Str0() is returned.
Token Quad_RE::eval_XB(Value_P X, Value_P B)
{
    Value_P placeholder = Str0(LOC);
    return Token(TOK_APL_VALUE1, placeholder);
}
ax_path_lib_pcre.m4
Description: application/m4
