Hi, I've attached some files that partially implement a tokeniser for PDF files. If there's interest, I can clean it up for inclusion. The .h file describes the state machine that's used.
This isn't integrated into the build system right now, but the Makefile should work if you fix the include path and copy pdf-stm-buffer.c from GNU PDF. toktest reads a PDF from stdin and prints the tokens. -- Michael
/* -*- mode: C -*- Time-stamp: "2009-01-05 08:53:02 mgold" * * File: pdf-tokeniser.h * Date: Mon Dec 29 00:45:09 2008 * * GNU PDF Library - Stream tokeniser * */ /* Copyright (C) 2008 Michael Gold */ /* This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef PDF_TOKENISER #define PDF_TOKENISER #include <pdf-types.h> #include <pdf-stm-buffer.h> enum pdf_tokeniser_state_e { PDF_TOKENISER_STATE_NONE = 0, PDF_TOKENISER_STATE_WSPACE, PDF_TOKENISER_STATE_COMMENT, PDF_TOKENISER_STATE_KEYWORD, PDF_TOKENISER_STATE_NAME, PDF_TOKENISER_STATE_STRING, PDF_TOKENISER_STATE_HEXSTRING, PDF_TOKENISER_STATE_DICTEND, PDF_TOKENISER_STATE_PENDING }; /* Tokeniser states (from pdf_tokeniser_state_e): * NONE - Initial state: not reading a token. * WSPACE - Reading whitespace into buffer. * COMMENT - Reading a comment. buffer collects the comment bytes, including * the initial '%'. * KEYWORD - Reading some regular characters into buffer; this could result * in a symbol like "null", or a number. * NAME - Reading a name (which starts with '/'). * Substates: * 0 - normal state * 1 - just read a '#' (escape prefix) * 2 - read the first hex digit after '#'; the value is in charparam * buffer collects the name, excluding the initial '/'. * STRING - Reading a literal string (enclosed in '(', ')'). 
* Substates: * 0 - normal state * 1 - ignore the next byte if its value is 10 (ASCII LF; * this is used to treat CRLF as a single line ending) * 2 - just saw a backslash (escape prefix) * 3 - read 1 octal digit; the value is in charparam * 4 - read 2 octal digits; the value is in charparam * intparam is the bracket nesting level; ')' at level 0 ends the string. * buffer collects the string. * HEXSTRING - Reading a hex string. * Substates: * 0 - initial state: we just saw the opening '<', and if the next byte is * also '<' this is the start of a dictionary rather than a string * 1 - normal state (the next hex digit will be the first in a pair) * 2 - read the first hex digit; its value is in charparam * 3 - end state; saw the closing '>' * buffer collects the string. * DICTEND - Just got a '>'; expecting another. * Substates: * 0 - starting state * 1 - saw the second '>' * PENDING - Need to emit a token (determined by charparam) ASAP. */ enum pdf_tokeniser_flag_e { PDF_TOKENISER_FLAG_RET_COMMENTS = 1, /* return comments as tokens */ PDF_TOKENISER_FLAG_PDF11 = 2, /* disallow '#' escapes in names */ }; /* Internal state */ struct pdf_tokeniser_s { int flags; /* miscellaneous settings (from pdf_tokeniser_flag_e) */ int state; int substate; pdf_char_t charparam; int intparam; pdf_stm_buffer_t buffer; }; //TODO: use pdf_obj structures for tokens enum pdf_token_type_e { PDF_TOKEN_TYPE_WSPACE, PDF_TOKEN_TYPE_COMMENT, PDF_TOKEN_TYPE_KEYWORD, PDF_TOKEN_TYPE_INTEGER, PDF_TOKEN_TYPE_REAL, PDF_TOKEN_TYPE_NAME, PDF_TOKEN_TYPE_STRING, PDF_TOKEN_TYPE_DICT_START, PDF_TOKEN_TYPE_DICT_END, PDF_TOKEN_TYPE_ARRAY_START, PDF_TOKEN_TYPE_ARRAY_END, PDF_TOKEN_TYPE_PROC_START, PDF_TOKEN_TYPE_PROC_END, }; struct pdf_token_s { char type; char _priv_flags; union { pdf_stm_buffer_t buffer; double real; int integer; } value; }; /* BEGIN PUBLIC */ typedef struct pdf_tokeniser_s *pdf_tokeniser_t; pdf_status_t pdf_tokeniser_new(pdf_tokeniser_t *context); pdf_status_t 
pdf_tokeniser_destroy(pdf_tokeniser_t context); typedef struct pdf_token_s *pdf_token_t; pdf_status_t read_token(pdf_tokeniser_t context, pdf_stm_buffer_t in, pdf_token_t *token, pdf_bool_t finish_p); /* END PUBLIC */ #endif /* End of pdf_tokeniser.h */
/* -*- mode: C -*- Time-stamp: "2009-01-05 08:53:24 mgold" * * File: pdf-tokeniser.c * Date: Mon Dec 29 00:45:09 2008 * * GNU PDF Library - Stream tokeniser * */ /* Copyright (C) 2008 Michael Gold */ /* This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define PDF_EBADFILE 90 /*XXX*/ #define PDF_EIMPLLIMIT 91 /*XXX*/ #include "config.h" #include <assert.h> #include <string.h> #include "pdf-stm-buffer.h" #include "pdf-tokeniser.h" static INLINE int can_store_char(pdf_tokeniser_t context); static INLINE pdf_status_t store_char(pdf_tokeniser_t context, pdf_char_t ch); static pdf_status_t exit_state(pdf_tokeniser_t context, pdf_token_t *token); static INLINE pdf_status_t handle_char( pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token); static INLINE pdf_status_t handle_string_char( pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token); static INLINE pdf_status_t handle_hexstring_char( pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token); pdf_status_t pdf_tokeniser_new(pdf_tokeniser_t *context) { int err; pdf_tokeniser_t new; err = PDF_ENOMEM; new = pdf_alloc(sizeof(**context)); if (!new) goto fail; /* max string size 32767 + terminating '\0' */ new->buffer = pdf_stm_buffer_new(32768); if (!new->buffer) goto fail; new->buffer->data[new->buffer->size-1] = 0; new->flags = 0; new->state = PDF_TOKENISER_STATE_NONE; new->substate = 0; *context = new; return PDF_OK; fail: if (new) 
pdf_dealloc(new); return err; } pdf_status_t pdf_tokeniser_destroy(pdf_tokeniser_t context) { if (!context) return PDF_EBADDATA; assert(context->buffer); if (context->buffer) pdf_stm_buffer_destroy(context->buffer); pdf_dealloc(context); return PDF_OK; } pdf_status_t pdf_token_destroy(pdf_token_t token) { if (!token) return PDF_EBADDATA; if (token->_priv_flags & 1) pdf_stm_buffer_destroy(token->value.buffer); pdf_dealloc(token); return PDF_OK; } static INLINE int pdf_is_wspace_char(pdf_char_t ch) { /* ASCII codes for NUL, HT, LF, FF, CR, SP */ return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32); } static INLINE int pdf_is_delim_char(pdf_char_t ch) { /* ASCII codes for '%', '(', ')', '/'; '<', '>', '[', ']'; '{', '}' */ return (ch == 37 || ch == 40 || ch == 41 || ch == 47 || ch == 60 || ch == 62 || ch == 91 || ch == 93 || ch == 123 || ch == 125); } static INLINE int pdf_is_eol_char(pdf_char_t ch) { return ch == 10 || ch == 13; } static INLINE int pdf_is_regular_char(pdf_char_t ch) { return !pdf_is_wspace_char(ch) && !pdf_is_delim_char(ch); } static INLINE pdf_char_t hexval(pdf_char_t ch) { if (ch >= 48 && ch <= 48+9) /* '0'--'9' */ return ch - 48; if (ch >= 64+1 && ch <= 64+6) /* 'A'--'F' */ return ch - (64+1) + 10; if (ch >= 96+1 && ch <= 96+6) /* 'a'--'f' */ return ch - (96+1) + 10; return 255; } static INLINE pdf_status_t handle_char(pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token) { pdf_status_t rv; /* first, handle the states that shouldn't be exited when whitespace * or a delimiter is seen */ switch (context->state) { case PDF_TOKENISER_STATE_STRING: return handle_string_char(context, ch, token); case PDF_TOKENISER_STATE_HEXSTRING: return handle_hexstring_char(context, ch, token); case PDF_TOKENISER_STATE_DICTEND: if (ch != 62) /* '>' */ return PDF_EBADFILE; context->substate = 1; /* saw the closing '>' */ return exit_state(context, token); case PDF_TOKENISER_STATE_COMMENT: if (context->substate == 1) { rv = 
store_char(context, context->charparam); if (rv != PDF_OK) return rv; context->substate = 0; } if (pdf_is_eol_char(ch)) { rv = exit_state(context, token); if (rv != PDF_OK) return rv; break; /* ensure the EOL is recorded as whitespace */ } if (store_char(context, ch) != PDF_OK) { /* the comment buffer is full, so split the token */ rv = exit_state(context, token); if (rv != PDF_OK) return rv; context->state = PDF_TOKENISER_STATE_COMMENT; return store_char(context, ch); } return PDF_OK; default: ; } /* now handle delimiters and whitespace */ if (pdf_is_delim_char(ch)) { /* set state 0 (UNINIT), substate 0, bufpos 0 */ rv = exit_state(context, token); if (rv != PDF_OK) return rv; switch (ch) { case 37: /* '%' */ context->state = PDF_TOKENISER_STATE_COMMENT; return store_char(context, ch); case 40: /* '(' */ context->state = PDF_TOKENISER_STATE_STRING; context->intparam = 0; return PDF_OK; case 41: /* ')' */ /* this shouldn't occur outside the STRING and COMMENT states */ return PDF_EBADFILE; case 47: /* '/' */ context->state = PDF_TOKENISER_STATE_NAME; return PDF_OK; case 60: /* '<' */ context->state = PDF_TOKENISER_STATE_HEXSTRING; return PDF_OK; case 62: /* '>' */ context->state = PDF_TOKENISER_STATE_DICTEND; return PDF_OK; case 91: /* '[' */ /* fall through */ case 93: /* ']' */ /* fall through */ case 123: /* '{' */ /* fall through */ case 125: /* '}' */ /* exit_state may have emitted a token, so we can't emit another * one now; we'll do it when exiting the PENDING state */ context->state = PDF_TOKENISER_STATE_PENDING; context->charparam = ch; return PDF_OK; } /* not reached (all delimiter chars should be handled) */ assert(0); } else if (pdf_is_wspace_char(ch)) { if (context->state == PDF_TOKENISER_STATE_WSPACE) { if (store_char(context, ch) == PDF_OK) return PDF_OK; /* otherwise, we'll start a new WS token */ } rv = exit_state(context, token); if (rv != PDF_OK) return rv; context->state = PDF_TOKENISER_STATE_WSPACE; return store_char(context, ch); } /* ch is a 
regular character */ switch (context->state) { case PDF_TOKENISER_STATE_PENDING: /* fall through */ case PDF_TOKENISER_STATE_WSPACE: rv = exit_state(context, token); /* ch isn't whitespace */ if (rv != PDF_OK) return rv; /* fall through */ case PDF_TOKENISER_STATE_NONE: context->state = PDF_TOKENISER_STATE_KEYWORD; /* fall through */ case PDF_TOKENISER_STATE_KEYWORD: return store_char(context, ch); case PDF_TOKENISER_STATE_NAME: if (context->substate == 0) { if (ch != 35 /* '#' */ || (context->flags & PDF_TOKENISER_FLAG_PDF11) ) return store_char(context, ch); context->substate = 1; return PDF_OK; } if ( (ch = hexval(ch)) == 255 ) return PDF_EBADFILE; if (context->substate == 1) /* the first hex digit of an escape */ { context->substate = 2; context->charparam = ch; return PDF_OK; } ch = (context->charparam << 4) | ch; if (ch == 0) /* the PDF spec forbids "#00" */ return PDF_EBADFILE; rv = store_char(context, ch); if (rv == PDF_OK) context->substate = 0; return rv; default: assert(0); } return store_char(context, ch); } static INLINE int can_store_char(pdf_tokeniser_t context) { return context->buffer->wp < (context->buffer->size - 1); } static INLINE pdf_status_t store_char(pdf_tokeniser_t context, pdf_char_t ch) { if (!can_store_char(context)) return PDF_EIMPLLIMIT; context->buffer->data[context->buffer->wp++] = ch; return PDF_OK; } static pdf_status_t exit_state(pdf_tokeniser_t context, pdf_token_t *token) { pdf_status_t err; pdf_token_t new_token = NULL; pdf_stm_buffer_t buf = NULL; int want_buffer = 0; int toktype; switch (context->state) { case PDF_TOKENISER_STATE_NONE: return PDF_OK; case PDF_TOKENISER_STATE_WSPACE: goto finish; case PDF_TOKENISER_STATE_COMMENT: if (!(context->flags & PDF_TOKENISER_FLAG_RET_COMMENTS)) goto finish; toktype = PDF_TOKEN_TYPE_COMMENT; want_buffer = 1; break; case PDF_TOKENISER_STATE_KEYWORD: //TODO: try to scan as number toktype = PDF_TOKEN_TYPE_KEYWORD; want_buffer = 1; break; case PDF_TOKENISER_STATE_NAME: if 
(context->substate != 0) /* reading an escape sequence */ return PDF_EBADFILE; toktype = PDF_TOKEN_TYPE_NAME; want_buffer = 1; break; case PDF_TOKENISER_STATE_STRING: if (context->intparam >= 0) /* didn't see the closing ')' */ return PDF_EBADFILE; toktype = PDF_TOKEN_TYPE_STRING; want_buffer = 1; break; case PDF_TOKENISER_STATE_HEXSTRING: if (context->substate != 3) /* didn't see the closing '>' */ return PDF_EBADFILE; toktype = PDF_TOKEN_TYPE_STRING; want_buffer = 1; break; case PDF_TOKENISER_STATE_DICTEND: if (context->substate != 1) /* didn't see a second '>' */ return PDF_EBADFILE; toktype = PDF_TOKEN_TYPE_DICT_END; break; case PDF_TOKENISER_STATE_PENDING: switch (context->charparam) { case 60: /* '<' */ toktype = PDF_TOKEN_TYPE_DICT_START; break; case 91: /* '[' */ toktype = PDF_TOKEN_TYPE_ARRAY_START; break; case 93: /* ']' */ toktype = PDF_TOKEN_TYPE_ARRAY_END; break; case 123: /* '{' */ toktype = PDF_TOKEN_TYPE_PROC_START; break; case 125: /* '}' */ toktype = PDF_TOKEN_TYPE_PROC_END; break; default: assert(0); return PDF_ERROR; } break; default: assert(0); return PDF_ERROR; } err = PDF_ENOMEM; new_token = pdf_alloc(sizeof(*new_token)); if (!new_token) goto fail; new_token->type = toktype; new_token->_priv_flags = 0; if (want_buffer) { err = PDF_ENOMEM; pdf_stm_buffer_t buf = pdf_stm_buffer_new(context->buffer->wp + 1); if (!buf) goto fail; buf->wp = context->buffer->wp; memcpy(buf->data, context->buffer->data, buf->size); buf->data[buf->wp] = 0; new_token->value.buffer = buf; new_token->_priv_flags |= 1; /* need to free buffer */ } *token = new_token; finish: context->state = PDF_TOKENISER_STATE_NONE; context->substate = 0; context->buffer->wp = 0; return PDF_OK; fail: if (new_token) pdf_dealloc(new_token); if (buf) pdf_stm_buffer_destroy(buf); return err; } static INLINE pdf_status_t handle_string_char(pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token) { pdf_status_t rv; start: switch (context->substate) { case 1: /* ignore LF */ 
context->substate = 0; if (ch == 10) return PDF_OK; /* fall through */ case 0: /* no special state */ if (ch == 92) /* '\\' */ { context->substate = 2; /* start an escape sequence */ return PDF_OK; } else if (ch == 41 && context->intparam <= 0) /* ')'; end of string */ { context->intparam = -1; return exit_state(context, token); } if (!can_store_char(context)) return PDF_EIMPLLIMIT; else if (ch == 40) /* '(' */ ++context->intparam; else if (ch == 41) /* ')' */ --context->intparam; else if (ch == 13) /*CR*/ { ch = 10; /* treat as LF */ context->substate = 1; /* ignore the next char if it's LF */ } return store_char(context, ch); case 2: /* just saw a '\\' (starting an escape sequence) */ context->substate = 0; if (ch == 98) /* 'b' */ ch = 8; /* BS: backspace */ else if (ch == 102) /* 'f' */ ch = 12; /* FF: formfeed */ else if (ch == 110) /* 'n' */ ch = 10; /* NL: newline */ else if (ch == 114) /* 'r' */ ch = 13; /* CR: carriage return */ else if (ch == 116) /* 't' */ ch = 9; /* HT: horizontal tab */ else if (ch == 10) /* NL */ return PDF_OK; /* ignore the line break */ else if (ch == 13) /* CR */ { /* ignore the line break; also ignore the next byte if it's LF */ context->substate = 1; return PDF_OK; } else if (ch >= 48 && ch <= 48+7) /* digits '0'--'7' */ { /* for digits '4'-'7', pretend we saw a leading '0' */ context->substate = (ch < 48+4) ? 
3 : 4; context->charparam = (ch-48); return PDF_OK; } /* for any other character, including '(', ')', and '\\', * store the same character (dropping the leading backslash) */ return store_char(context, ch); case 3: /* saw 1 digit of an octal escape */ /* fall through */ case 4: /* saw 2 digits of an octal escape */ if (ch < 48 && ch > 48+7) /* not digits '0'--'7' */ { rv = store_char(context, context->charparam); if (rv != PDF_OK) return rv; /* ch isn't part of the escape sequence, so retry */ context->substate = 0; goto start; } /* ch is a digit from '0'--'7' */ context->charparam = (context->charparam << 3) | (ch - 48); if (context->substate == 4) /* this was the final digit */ { rv = store_char(context, context->charparam); if (rv != PDF_OK) return rv; context->substate = 0; return PDF_OK; } context->substate = 4; return PDF_OK; default: assert(0); } } static INLINE pdf_status_t handle_hexstring_char(pdf_tokeniser_t context, pdf_char_t ch, pdf_token_t *token) { pdf_status_t rv; if (context->substate == 0) { /* this is the first character after the initial '<' */ if (ch == 60) /* '<' */ { /* this was actually the start of a dictionary */ context->state = PDF_TOKENISER_STATE_PENDING; context->charparam = ch; return exit_state(context, token); } context->substate = 1; } if (pdf_is_wspace_char(ch)) return PDF_OK; if (ch == 62) /* '>': end of hex string */ { if (context->substate == 2) { /* the last digit is missing; assume it's '0' */ rv = store_char(context, context->charparam << 4); if (rv != PDF_OK) return rv; } context->substate = 3; /* saw end of string */ return exit_state(context, token); } if ( (ch = hexval(ch)) == 255 ) return PDF_EBADFILE; if (context->substate == 1) /* first character in a pair */ { context->substate = 2; context->charparam = ch; return PDF_OK; } rv = store_char(context, (context->charparam << 4) | ch); if (rv == PDF_OK) context->substate = 1; return rv; } pdf_status_t read_token(pdf_tokeniser_t context, pdf_stm_buffer_t in, pdf_token_t 
*token, pdf_bool_t finish_p) { pdf_status_t rv; if (!token) return PDF_EBADDATA; *token = NULL; while (!pdf_stm_buffer_eob_p(in)) { rv = handle_char(context, in->data[in->rp], token); if (rv != PDF_OK) return rv; ++in->rp; if (*token) goto ret_token; } if (finish_p) { rv = exit_state(context, token); if (rv != PDF_OK) return rv; else if (*token) goto ret_token; return PDF_EEOF; } return PDF_ENINPUT; ret_token: return PDF_OK; } /* End of pdf-tokeniser.c */
#include <stdio.h>
#include <assert.h>
#include "pdf-tokeniser.h"
#include "pdf-error.h"
/* Print one line describing TOKEN to stdout, in the form
 * "token TYPE(LEN): TEXT".  For token types that carry a text buffer,
 * LEN is the buffer's write position and TEXT is exactly LEN bytes of its
 * data; for all other types LEN is -1 and TEXT is "(no text)". */
void print_token(pdf_token_t token)
{
  const char *typ;
  int show_buf = 0;

  switch (token->type)
    {
    case PDF_TOKEN_TYPE_WSPACE:
      typ = "WSPACE";
      break;
    case PDF_TOKEN_TYPE_COMMENT:
      typ = "COMMENT";
      show_buf = 1;
      break;
    case PDF_TOKEN_TYPE_KEYWORD:
      typ = "KEYWORD";
      show_buf = 1;
      break;
    case PDF_TOKEN_TYPE_INTEGER:  /* TODO */
      typ = "INTEGER";
      break;
    case PDF_TOKEN_TYPE_REAL:  /* TODO */
      typ = "REAL";
      break;
    case PDF_TOKEN_TYPE_NAME:
      typ = "NAME";
      show_buf = 1;
      break;
    case PDF_TOKEN_TYPE_STRING:
      typ = "STRING";
      show_buf = 1;
      break;
    case PDF_TOKEN_TYPE_DICT_START:
      typ = "DICT_START";
      break;
    case PDF_TOKEN_TYPE_DICT_END:
      typ = "DICT_END";
      break;
    case PDF_TOKEN_TYPE_ARRAY_START:
      typ = "ARRAY_START";
      break;
    case PDF_TOKEN_TYPE_ARRAY_END:
      typ = "ARRAY_END";
      break;
    case PDF_TOKEN_TYPE_PROC_START:
      typ = "PROC_START";
      break;
    case PDF_TOKEN_TYPE_PROC_END:
      typ = "PROC_END";
      break;
    default:
      typ = "(unknown)";
      break;
    }

  if (!show_buf)
    {
      printf("token %s(%d): %s\n", typ, -1, "(no text)");
      return;
    }

  /* Strings may contain NUL bytes, so write the recorded number of bytes
   * rather than relying on "%s", which would stop at the first NUL. */
  printf("token %s(%d): ", typ, (int)token->value.buffer->wp);
  fwrite(token->value.buffer->data, 1, token->value.buffer->wp, stdout);
  putchar('\n');
}
/* pdf_token_destroy is defined in pdf-tokeniser.c.  It is declared here as
 * well so that this file compiles whether or not pdf-tokeniser.h exports
 * it; a repeated compatible declaration is harmless. */
pdf_status_t pdf_token_destroy(pdf_token_t token);

/* Tokenise everything readable from FILE and print each token with
 * print_token.  Stops at end of input, on a read error (reported with
 * perror), or on a tokeniser error (reported on stdout).  Always prints
 * "done" before returning. */
void print_file(FILE *file)
{
  pdf_status_t rv;
  pdf_tokeniser_t tokeniser;
  pdf_stm_buffer_t buffer;
  pdf_token_t token;
  int at_eof;

  rv = pdf_tokeniser_new(&tokeniser);
  assert(rv == PDF_OK);
  buffer = pdf_stm_buffer_new(4096);
  assert(buffer);

  for (;;)
    {
      buffer->rp = 0;
      buffer->wp = fread(buffer->data, 1, buffer->size, file);
      if (ferror(file))
        {
          perror("fread");
          break;
        }
      at_eof = (feof(file) != 0);

      /* Drain the chunk.  read_token is called until it stops returning
       * tokens, rather than until the buffer is empty: at end of file
       * the call that flushes the last pending token happens after the
       * final byte has already been consumed. */
      do
        {
          rv = read_token(tokeniser, buffer, &token, at_eof);
          if (rv == PDF_OK)
            {
              print_token(token);
              pdf_token_destroy(token);
            }
        }
      while (rv == PDF_OK);

      if (rv == PDF_EEOF)
        break;  /* input finished and everything was flushed */
      if (rv != PDF_ENINPUT)
        {
          printf("read_token error %d\n", rv);
          break;
        }
      /* PDF_ENINPUT: the tokeniser wants the next chunk */
    }

  pdf_stm_buffer_destroy(buffer);
  pdf_tokeniser_destroy(tokeniser);
  printf("done\n");
}
/* Program entry point: tokenise the data on standard input and print the
 * tokens.  Command-line arguments are ignored.  Always returns 0. */
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

  print_file(stdin);

  return 0;
}
# Build the toktest demo program.  pdf_alloc/pdf_dealloc are mapped to the
# C library allocator; adjust the first -I path to point at the GNU PDF
# sources (src/base) on your machine.
CFLAGS=-Dpdf_alloc=malloc -Dpdf_dealloc=free \
       -I$(HOME)/src/gnupdf-trunk/src/base/ -I. -Wall

.PHONY: all
all: toktest

# Linked by make's built-in rule for building a program from object files.
toktest: pdf-tokeniser.o pdf-stm-buffer.o toktest.o

# Extra prerequisites only; the objects are compiled by the built-in .c.o rule.
pdf-tokeniser.o: pdf-tokeniser.c pdf-tokeniser.h

toktest.o: toktest.c pdf-tokeniser.h
signature.asc
Description: Digital signature
