Here is a patch with a better fix for the apostrophe problem.  As before, it
also accepts CRLF as an end-of-line sequence.

julia
diff --git a/parsing_c/lexer_c.mll b/parsing_c/lexer_c.mll
index 12f42d5..ab777cf 100644
--- a/parsing_c/lexer_c.mll
+++ b/parsing_c/lexer_c.mll
@@ -348,19 +348,19 @@ rule token = parse
    * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
    *)
 
-  | "#" spopt "pragma"  sp  [^'\n']* '\n'
-  | "#" spopt "ident"   sp  [^'\n']* '\n'
-  | "#" spopt "line"    sp  [^'\n']* '\n'
-  | "#" spopt "error"   sp  [^'\n']* '\n'
-  | "#" spopt "warning" sp  [^'\n']* '\n'
-  | "#" spopt "abort"   sp  [^'\n']* '\n'
+  | "#" spopt "pragma"  sp  [^'\n' '\r']* ('\n' | "\r\n")
+  | "#" spopt "ident"   sp  [^'\n' '\r']* ('\n' | "\r\n")
+  | "#" spopt "line"    sp  [^'\n' '\r']* ('\n' | "\r\n")
+  | "#" spopt "error"   sp  [^'\n' '\r']* ('\n' | "\r\n")
+  | "#" spopt "warning" sp  [^'\n' '\r']* ('\n' | "\r\n")
+  | "#" spopt "abort"   sp  [^'\n' '\r']* ('\n' | "\r\n")
       { TCppDirectiveOther (tokinfo lexbuf) }
 
-  | "#" [' ' '\t']* '\n'
+  | "#" [' ' '\t']* ('\n' | "\r\n")
       { TCppDirectiveOther (tokinfo lexbuf) }
 
   (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
-  | "#" sp pent sp  '"' [^ '"']* '"' (spopt pent)*  spopt '\n'
+  | "#" sp pent sp  '"' [^ '"']* '"' (spopt pent)*  spopt ('\n' | "\r\n")
       { TCppDirectiveOther (tokinfo lexbuf) }
 
 
@@ -426,7 +426,7 @@ rule token = parse
       }
 
  (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
-  | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
+  | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n' '\r']* ('\n' | "\r\n")
       { let info = tokinfo lexbuf in
         TIfdefMisc (false, no_ifdef_mark(), info)
       }
@@ -521,7 +521,7 @@ rule token = parse
   (* can be at eof *)
   (*| "#" [' ' '\t']* "endif"                { TEndif     (tokinfo lexbuf) }*)
 
-  | "#" [' ' '\t']* "else" [' ' '\t' '\n']
+  | "#" [' ' '\t']* "else" ([' ' '\t' '\n'] | "\r\n")
       { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
 
 
@@ -532,7 +532,7 @@ rule token = parse
   (* ---------------------- *)
 
   (* only in cpp directives normally *)
-  | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
+  | "\\" ('\n' | "\r\n") { TCppEscapedNewline (tokinfo lexbuf) }
 
   (* We must generate separate tokens for #, ## and extend the grammar.
    * Note there can be "elaborated" idents in many different places, in
@@ -860,14 +860,14 @@ rule token = parse
 
 (*****************************************************************************)
 and char = parse
-  | (_ as x)                                    "'"  { String.make 1 x }
+  | (_ as x)                           { String.make 1 x ^ restchars lexbuf }
   (* todo?: as for octal, do exception  beyond radix exception ? *)
-  | (("\\" (oct | oct oct | oct oct oct)) as x  "'") { x }
+  | (("\\" (oct | oct oct | oct oct oct)) as x     ) { x ^ restchars lexbuf }
   (* this rule must be after the one with octal, lex try first longest
    * and when \7  we want an octal, not an exn.
    *)
-  | (("\\x" ((hex | hex hex))) as x        "'")      { x }
-  | (("\\" (_ as v))           as x        "'")
+  | (("\\x" ((hex | hex hex))) as x           )      { x ^ restchars lexbuf }
+  | (("\\" (_ as v))           as x           )
        {
           (match v with (* Machine specific ? *)
           | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
@@ -877,13 +877,38 @@ and char = parse
          | _ ->
               pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
          );
-          x
+          x ^ restchars lexbuf
        }
   | _
       { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
-        tok lexbuf
+        tok lexbuf ^ restchars lexbuf
       }
 
+and restchars = parse
+  | "'"                                { "" }
+  | (_ as x)                           { String.make 1 x ^ restchars lexbuf }
+  (* todo?: as for octal, do exception  beyond radix exception ? *)
+  | (("\\" (oct | oct oct | oct oct oct)) as x     ) { x ^ restchars lexbuf }
+  (* this rule must be after the one with octal, lex try first longest
+   * and when \7  we want an octal, not an exn.
+   *)
+  | (("\\x" ((hex | hex hex))) as x           )      { x ^ restchars lexbuf }
+  | (("\\" (_ as v))           as x           )
+       {
+          (match v with (* Machine specific ? *)
+          | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
+          | 'f' -> () | 'a' -> ()
+         | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
+          | 'e' -> () (* linuxext: ? *)
+         | _ ->
+              pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
+         );
+          x ^ restchars lexbuf
+       }
+  | _
+      { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
+        tok lexbuf ^ restchars lexbuf
+      }
 
 
 (*****************************************************************************)
@@ -965,7 +990,7 @@ and cpp_eat_until_nl = parse
   (* noteopti:
    * update: need also deal with comments chars now
    *)
-  | [^ '\n' '\\'      '/' '*'  ]+
+  | [^ '\n' '\r' '\\'      '/' '*'  ]+
      { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
   | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
   | _   { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
_______________________________________________
Cocci mailing list
[email protected]
http://lists.diku.dk/mailman/listinfo/cocci
(Web access from inside DIKU's LAN only)

Reply via email to