Package: flex
Version: 2.5.35-3
Severity: important
2.5.35-3 fails to process input that worked in 2.5.35-2.
This is causing another FTBFS issue in apertium, #504028.
Attached is the input file generated during apertium's build.
With flex_2.5.35-2 this will succeed (even on ia64), but flex_2.5.35-3
will fail on both the archs I tested (i386 and ia64).
$ /usr/bin/flex -Cfer -t >apertium_deshtml.cc < input2.txt
/usr/bin/m4:stdin:9: ERROR: end of file in string
--
dann frazier
%{
#include <cstdlib>
#include <iostream>
#include <map>
#include <vector>
#include <regex.h>
#include <string>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/ltstr.h>
using namespace std;
wstring buffer;
string symbuf = "";
bool isDot, hasWrite_dot, hasWrite_white;
FILE *formatfile;
string last;
int current;
long int offset;
vector<long int> offsets;
vector<wstring> tags;
vector<int> orders;
regex_t escape_chars;
regex_t names_regexp;
void bufferAppend(wstring &buf, string const &str)
{
symbuf.append(str);
for(size_t i = 0, limit = symbuf.size(); i < limit;)
{
wchar_t symbol;
int gap = mbtowc(&symbol, symbuf.c_str() + i, MB_CUR_MAX);
if(gap == -1)
{
if(i + MB_CUR_MAX < limit)
{
buf += L'?';
gap = 1;
}
else
{
symbuf = symbuf.substr(i);
return;
}
}
else
{
buf += symbol;
}
i += gap;
}
symbuf = "";
return;
}
void init_escape()
{
if(regcomp(&escape_chars, "[EMAIL PROTECTED]/]", REG_EXTENDED))
{
cerr << "ERROR: Illegal regular expression for escape characters" << endl;
exit(EXIT_FAILURE);
}
}
void init_tagNames()
{
if(regcomp(&names_regexp, "[a-zA-Z]+", REG_EXTENDED))
{
cerr << "ERROR: Illegal regular expression for tag-names" << endl;
exit(EXIT_FAILURE);
}
}
string backslash(string const &str)
{
string new_str = "";
for(unsigned int i = 0; i < str.size(); i++)
{
if(str[i] == '\\')
{
new_str += str[i];
}
new_str += str[i];
}
return new_str;
}
wstring escape(string const &str)
{
regmatch_t pmatch;
char const *mystring = str.c_str();
int base = 0;
wstring result = L"";
while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0))
{
bufferAppend(result, str.substr(base, pmatch.rm_so));
result += L'\\';
wchar_t micaracter;
int pos = mbtowc(&micaracter, str.c_str() + base + pmatch.rm_so,
MB_CUR_MAX);
if(pos == -1)
{
wcerr << L"Uno" << endl;
wcerr << L"Encoding error." << endl;
exit(EXIT_FAILURE);
}
result += micaracter;
base += pmatch.rm_eo;
}
bufferAppend(result, str.substr(base));
return result;
}
wstring escape(wstring const &str)
{
string dest = "";
for(size_t i = 0, limit = str.size(); i < limit; i++)
{
char symbol[MB_CUR_MAX+1];
int pos = wctomb(symbol, str[i]);
if(pos == -1)
{
symbol[0]='?';
pos = 1;
}
symbol[pos] = 0;
dest.append(symbol);
}
return escape(dest);
}
string get_tagName(string tag){
regmatch_t pmatch;
char const *mystring = tag.c_str();
string result = "";
if(!regexec(&names_regexp, mystring, 1, &pmatch, 0))
{
result=tag.substr(pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so);
return result;
}
return "";
}
map<string, wstring, Ltstr> S1_substitution;
void S1_init()
{
S1_substitution["À"] = L"??";
S1_substitution["À"] = L"??";
S1_substitution["À"] = L"??";
S1_substitution["À"] = L"??";
S1_substitution["Á"] = L"??";
S1_substitution["Á"] = L"??";
S1_substitution["Á"] = L"??";
S1_substitution["Á"] = L"??";
S1_substitution["Â"] = L"??";
S1_substitution["Â"] = L"??";
S1_substitution["Â"] = L"??";
S1_substitution["Â"] = L"??";
S1_substitution["Ã"] = L"??";
S1_substitution["Ã"] = L"??";
S1_substitution["Ã"] = L"??";
S1_substitution["Ã"] = L"??";
S1_substitution["Ä"] = L"??";
S1_substitution["Ä"] = L"??";
S1_substitution["Ä"] = L"??";
S1_substitution["Ä"] = L"??";
S1_substitution["Å"] = L"??";
S1_substitution["Å"] = L"??";
S1_substitution["Å"] = L"??";
S1_substitution["Å"] = L"??";
S1_substitution["Æ"] = L"??";
S1_substitution["Æ"] = L"??";
S1_substitution["Æ"] = L"??";
S1_substitution["Æ"] = L"??";
S1_substitution["Ç"] = L"??";
S1_substitution["Ç"] = L"??";
S1_substitution["Ç"] = L"??";
S1_substitution["Ç"] = L"??";
S1_substitution["È"] = L"??";
S1_substitution["È"] = L"??";
S1_substitution["È"] = L"??";
S1_substitution["È"] = L"??";
S1_substitution["É"] = L"??";
S1_substitution["É"] = L"??";
S1_substitution["É"] = L"??";
S1_substitution["É"] = L"??";
S1_substitution["Ê"] = L"??";
S1_substitution["Ê"] = L"??";
S1_substitution["Ê"] = L"??";
S1_substitution["Ê"] = L"??";
S1_substitution["Ë"] = L"??";
S1_substitution["Ë"] = L"??";
S1_substitution["Ë"] = L"??";
S1_substitution["Ë"] = L"??";
S1_substitution["Ì"] = L"??";
S1_substitution["Ì"] = L"??";
S1_substitution["Ì"] = L"??";
S1_substitution["Ì"] = L"??";
S1_substitution["Í"] = L"??";
S1_substitution["Í"] = L"??";
S1_substitution["Í"] = L"??";
S1_substitution["Í"] = L"??";
S1_substitution["Î"] = L"??";
S1_substitution["Î"] = L"??";
S1_substitution["Î"] = L"??";
S1_substitution["Î"] = L"??";
S1_substitution["Ï"] = L"??";
S1_substitution["Ï"] = L"??";
S1_substitution["Ï"] = L"??";
S1_substitution["Ï"] = L"??";
S1_substitution["Ð"] = L"??";
S1_substitution["Ð"] = L"??";
S1_substitution["Ð"] = L"??";
S1_substitution["Ð"] = L"??";
S1_substitution["Ñ"] = L"??";
S1_substitution["Ñ"] = L"??";
S1_substitution["Ñ"] = L"??";
S1_substitution["Ñ"] = L"??";
S1_substitution["Ò"] = L"??";
S1_substitution["Ò"] = L"??";
S1_substitution["Ò"] = L"??";
S1_substitution["Ò"] = L"??";
S1_substitution["Ó"] = L"??";
S1_substitution["Ó"] = L"??";
S1_substitution["Ó"] = L"??";
S1_substitution["Ó"] = L"??";
S1_substitution["Ô"] = L"??";
S1_substitution["Ô"] = L"??";
S1_substitution["Ô"] = L"??";
S1_substitution["Ô"] = L"??";
S1_substitution["Õ"] = L"??";
S1_substitution["Õ"] = L"??";
S1_substitution["Õ"] = L"??";
S1_substitution["Õ"] = L"??";
S1_substitution["Ö"] = L"??";
S1_substitution["Ö"] = L"??";
S1_substitution["Ö"] = L"??";
S1_substitution["Ö"] = L"??";
S1_substitution["Ø"] = L"??";
S1_substitution["Ø"] = L"??";
S1_substitution["Ø"] = L"??";
S1_substitution["Ø"] = L"??";
S1_substitution["Ù"] = L"??";
S1_substitution["Ù"] = L"??";
S1_substitution["Ù"] = L"??";
S1_substitution["Ù"] = L"??";
S1_substitution["Ú"] = L"??";
S1_substitution["Ú"] = L"??";
S1_substitution["Ú"] = L"??";
S1_substitution["Ú"] = L"??";
S1_substitution["Û"] = L"??";
S1_substitution["Û"] = L"??";
S1_substitution["Û"] = L"??";
S1_substitution["Û"] = L"??";
S1_substitution["Ü"] = L"??";
S1_substitution["Ü"] = L"??";
S1_substitution["Ü"] = L"??";
S1_substitution["Ü"] = L"??";
S1_substitution["Ý"] = L"??";
S1_substitution["Ý"] = L"??";
S1_substitution["Ý"] = L"??";
S1_substitution["Ý"] = L"??";
S1_substitution["Þ"] = L"??";
S1_substitution["Þ"] = L"??";
S1_substitution["Þ"] = L"??";
S1_substitution["Þ"] = L"??";
S1_substitution["ß"] = L"??";
S1_substitution["ß"] = L"??";
S1_substitution["ß"] = L"??";
S1_substitution["ß"] = L"??";
S1_substitution["à"] = L"??";
S1_substitution["à"] = L"??";
S1_substitution["à"] = L"??";
S1_substitution["à"] = L"??";
S1_substitution["á"] = L"??";
S1_substitution["á"] = L"??";
S1_substitution["á"] = L"??";
S1_substitution["á"] = L"??";
S1_substitution["â"] = L"??";
S1_substitution["â"] = L"??";
S1_substitution["â"] = L"??";
S1_substitution["â"] = L"??";
S1_substitution["ã"] = L"??";
S1_substitution["ã"] = L"??";
S1_substitution["ã"] = L"??";
S1_substitution["ã"] = L"??";
S1_substitution["ä"] = L"??";
S1_substitution["ä"] = L"??";
S1_substitution["ä"] = L"??";
S1_substitution["ä"] = L"??";
S1_substitution["å"] = L"??";
S1_substitution["å"] = L"??";
S1_substitution["å"] = L"??";
S1_substitution["å"] = L"??";
S1_substitution["æ"] = L"??";
S1_substitution["æ"] = L"??";
S1_substitution["æ"] = L"??";
S1_substitution["æ"] = L"??";
S1_substitution["ç"] = L"??";
S1_substitution["ç"] = L"??";
S1_substitution["ç"] = L"??";
S1_substitution["ç"] = L"??";
S1_substitution["è"] = L"??";
S1_substitution["è"] = L"??";
S1_substitution["è"] = L"??";
S1_substitution["è"] = L"??";
S1_substitution["é"] = L"??";
S1_substitution["é"] = L"??";
S1_substitution["é"] = L"??";
S1_substitution["é"] = L"??";
S1_substitution["ê"] = L"??";
S1_substitution["ê"] = L"??";
S1_substitution["ê"] = L"??";
S1_substitution["ê"] = L"??";
S1_substitution["ë"] = L"??";
S1_substitution["ë"] = L"??";
S1_substitution["ë"] = L"??";
S1_substitution["ë"] = L"??";
S1_substitution["ì"] = L"??";
S1_substitution["ì"] = L"??";
S1_substitution["ì"] = L"??";
S1_substitution["ì"] = L"??";
S1_substitution["í"] = L"??";
S1_substitution["í"] = L"??";
S1_substitution["í"] = L"??";
S1_substitution["í"] = L"??";
S1_substitution["î"] = L"??";
S1_substitution["î"] = L"??";
S1_substitution["î"] = L"??";
S1_substitution["î"] = L"??";
S1_substitution["ï"] = L"??";
S1_substitution["ï"] = L"??";
S1_substitution["ï"] = L"??";
S1_substitution["ï"] = L"??";
S1_substitution["ð"] = L"??";
S1_substitution["ð"] = L"??";
S1_substitution["ð"] = L"??";
S1_substitution["ð"] = L"??";
S1_substitution["ñ"] = L"??";
S1_substitution["ñ"] = L"??";
S1_substitution["ñ"] = L"??";
S1_substitution["ñ"] = L"??";
S1_substitution["ò"] = L"??";
S1_substitution["ò"] = L"??";
S1_substitution["ò"] = L"??";
S1_substitution["ò"] = L"??";
S1_substitution["ó"] = L"??";
S1_substitution["ó"] = L"??";
S1_substitution["ó"] = L"??";
S1_substitution["ó"] = L"??";
S1_substitution["ô"] = L"??";
S1_substitution["ô"] = L"??";
S1_substitution["ô"] = L"??";
S1_substitution["ô"] = L"??";
S1_substitution["õ"] = L"??";
S1_substitution["õ"] = L"??";
S1_substitution["õ"] = L"??";
S1_substitution["õ"] = L"??";
S1_substitution["ö"] = L"??";
S1_substitution["ö"] = L"??";
S1_substitution["ö"] = L"??";
S1_substitution["ö"] = L"??";
S1_substitution["ø"] = L"??";
S1_substitution["ø"] = L"??";
S1_substitution["ø"] = L"??";
S1_substitution["ø"] = L"??";
S1_substitution["ù"] = L"??";
S1_substitution["ù"] = L"??";
S1_substitution["ù"] = L"??";
S1_substitution["ù"] = L"??";
S1_substitution["ú"] = L"??";
S1_substitution["ú"] = L"??";
S1_substitution["ú"] = L"??";
S1_substitution["ú"] = L"??";
S1_substitution["û"] = L"??";
S1_substitution["û"] = L"??";
S1_substitution["û"] = L"??";
S1_substitution["û"] = L"??";
S1_substitution["ü"] = L"??";
S1_substitution["ü"] = L"??";
S1_substitution["ü"] = L"??";
S1_substitution["ü"] = L"??";
S1_substitution["ý"] = L"??";
S1_substitution["ý"] = L"??";
S1_substitution["ý"] = L"??";
S1_substitution["ý"] = L"??";
S1_substitution["þ"] = L"??";
S1_substitution["þ"] = L"??";
S1_substitution["þ"] = L"??";
S1_substitution["þ"] = L"??";
S1_substitution["ÿ"] = L"??";
S1_substitution["ÿ"] = L"??";
S1_substitution["ÿ"] = L"??";
S1_substitution["ÿ"] = L"??";
S1_substitution["·"] = L"??";
S1_substitution["·"] = L"??";
S1_substitution["·"] = L"??";
S1_substitution["·"] = L"??";
S1_substitution["’"] = L"'";
}
void printBuffer()
{
if(isDot)
{
fputws_unlocked(L".[]", yyout);
isDot = false;
}
if(buffer.size() > 8192)
{
string filename = tmpnam(NULL);
FILE *largeblock = fopen(filename.c_str(), "w");
fputws_unlocked(buffer.c_str(), largeblock);
fclose(largeblock);
fputwc_unlocked(L'[', yyout);
fputwc_unlocked(L'@', yyout);
wchar_t cad[filename.size()];
size_t pos = mbstowcs(cad, filename.c_str(), filename.size());
if(pos == (size_t) -1)
{
wcerr << L"Tres" << endl;
wcerr << L"Encoding error." << endl;
exit(EXIT_FAILURE);
}
cad[pos] = 0;
fputws_unlocked(cad, yyout);
fputwc_unlocked(L']', yyout);
}
else if(buffer.size() > 1)
{
fputwc_unlocked(L'[', yyout);
wstring const tmp = escape(buffer);
if(tmp[0] == L'@')
{
fputwc_unlocked(L'\\', yyout);
}
fputws_unlocked(tmp.c_str(), yyout);
fputwc_unlocked(L']', yyout);
}
else if(buffer.size() == 1 && buffer[0] != L' ')
{
fputwc_unlocked(L'[', yyout);
wstring const tmp = escape(buffer);
if(tmp[0] == L'@')
{
fputwc_unlocked(L'\\', yyout);
}
fputws_unlocked(tmp.c_str(), yyout);
fputwc_unlocked(L']', yyout);
}
else
{
fputws_unlocked(buffer.c_str(), yyout);
}
buffer = L"";
}
%}
%x C1 C2 C3
%option nounput
%option noyywrap
%option caseless
%option stack
%%
<C1>{
"-->" {
last = "buffer";
bufferAppend(buffer, yytext);
yy_pop_state();
}
\n|. {
last = "buffer";
bufferAppend(buffer, yytext);
}
}
<C2>{
"<!--" {
bufferAppend(buffer, yytext);
yy_push_state(C1);
}
"</script"(" "[^>]*)?">" {
last = "buffer";
bufferAppend(buffer, yytext);
yy_pop_state();
}
\n|. {
last = "buffer";
bufferAppend(buffer, yytext);
}
}
<C3>{
"<!--" {
bufferAppend(buffer, yytext);
yy_push_state(C1);
}
"</style"(" "[^>]*)?">" {
last = "buffer";
bufferAppend(buffer, yytext);
yy_pop_state();
}
\n|. {
last = "buffer";
bufferAppend(buffer, yytext);
}
}
"<!--" {
bufferAppend(buffer, yytext);
yy_push_state(C1);
}
"<script"(" "[^>]*)?">" {
bufferAppend(buffer, yytext);
yy_push_state(C2);
}
"<style"(" "[^>]*)?">" {
bufferAppend(buffer, yytext);
yy_push_state(C3);
}
"<br"(" "[^>]*)?">"|"<hr"(" "[^>]*)?">"|"<p"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"<li"(" "[^>]*)?">"|"<ul"(" "[^>]*)?">"|"<ol"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"<tr"(" "[^>]*)?">"|"<td"(" "[^>]*)?">"|"<th"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"</br"(" "[^>]*)?">"|"</hr"(" "[^>]*)?">"|"</p"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"</li"(" "[^>]*)?">"|"</ul"(" "[^>]*)?">"|"</ol"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"</tr"(" "[^>]*)?">"|"</td"(" "[^>]*)?">"|"</th"(" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"<title"(" "[^>]*)?">"|"<div"(" "[^>]*)?">"|"<option"(" "[^>]*)?">"|"<h"[1-6]("
"[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"</title"(" "[^>]*)?">"|"</div"(" "[^>]*)?">"|"</option"("
"[^>]*)?">"|"</h"[1-6](" "[^>]*)?">" {
isDot = true;
bufferAppend(buffer, yytext);
}
"<"("img"|"link")(" "[^>]*)?">" {
bufferAppend(buffer, yytext);
}
("<!"|"<?")[a-zA-Z][^>]*">" {
bufferAppend(buffer, yytext);
}
"<"[a-zA-Z][^>]*">" {
bufferAppend(buffer, yytext);
}
"</"[a-zA-Z][^>]*">" {
bufferAppend(buffer, yytext);
}
"&"([a-zA-Z]+|"#x"[0-9a-fA-F]{1,4}|"#"[0-9]{1,8}); {
if(S1_substitution.find(yytext) != S1_substitution.end())
{
printBuffer();
fputws_unlocked(S1_substitution[yytext].c_str(), yyout);
offset+=S1_substitution[yytext].size();
hasWrite_dot = hasWrite_white = true;
}
else
{
last="buffer";
bufferAppend(buffer, yytext);
}
}
[ \n\t\r$*<>] {
if (last == "open_tag")
bufferAppend(tags.back(), yytext);
else
bufferAppend(buffer, yytext);
}
[EMAIL PROTECTED]/] {
printBuffer();
fputwc_unlocked(L'\\', yyout);
offset++;
wchar_t symbol;
int pos = mbtowc(&symbol, yytext, MB_CUR_MAX);
if(pos == -1)
{
wcerr << L"Cuatro" << endl;
wcerr << L"Encoding error." << endl;
exit(EXIT_FAILURE);
}
fputwc_unlocked(symbol, yyout);
offset++;
hasWrite_dot = hasWrite_white = true;
}
. {
printBuffer();
symbuf += yytext;
wchar_t symbol;
int pos = mbtowc(&symbol, symbuf.c_str(), MB_CUR_MAX);
if(pos == -1)
{
if(symbuf.size() > MB_CUR_MAX)
{
// unknown character
symbuf = "";
fputwc_unlocked(L'?', yyout);
offset++;
hasWrite_dot = hasWrite_white = true;
}
}
else
{
symbuf = "";
fputwc_unlocked(symbol, yyout);
offset++;
hasWrite_dot = hasWrite_white = true;
}
}
<<EOF>> {
isDot = true;
printBuffer();
return 0;
}
%%
void usage(string const &progname)
{
cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl;
cerr << "html format processor " << endl;
exit(EXIT_SUCCESS);
}
int main(int argc, char *argv[])
{
LtLocale::tryToSetLocale();
if(argc > 3)
{
usage(argv[0]);
}
switch(argc)
{
case 3:
yyout = fopen(argv[2], "w");
if(!yyout)
{
usage(argv[0]);
}
case 2:
yyin = fopen(argv[1], "r");
if(!yyin)
{
usage(argv[0]);
}
break;
default:
break;
}
// prevent warning message
yy_push_state(1);
yy_top_state();
yy_pop_state();
S1_init();
last = "";
buffer = L"";
isDot = hasWrite_dot = hasWrite_white = false;
current=0;
offset = 0;
init_escape();
init_tagNames();
yylex();
fclose(yyin);
fclose(yyout);
}