# Tries to follow the specification
#   Extensible Markup Language (XML) 1.0 (Fifth Edition)
#   W3C Recommendation 26 November 2008
#   http://www.w3.org/TR/2008/REC-xml-20081126/
#
# It's "only" an XML grammar:
# most of the well-formedness constraints and validity constraints are not
# implemented.
#
# NOTE(review): this file was restored from a copy in which line breaks and
# every <subrule> call had been stripped.  The subrule calls below were
# reconstructed from the W3C production quoted above each token and from the
# token names defined in this grammar -- verify against the upstream source.
#
# Historical Rakudo limitations noted by the original author:
#   - \x hexadecimal notation in character classes did not work for values
#     above \x100; the full ranges from the specification are now written out.
#   - overriding builtin rules did not work correctly, so only "token" is
#     used (with explicit whitespace subrules) and no "rule".
#
# Original author's to-do list:
#   - read S05
#   - learn more about backtracking
#   - learn more about XML
#
# Productions of the form "A - B" (CharData, PI, CData, Ignore, Comment) are
# expressed with a negative lookahead in front of every consumed character.

use v6;

grammar XML::Grammar {

    # [1] document ::= prolog element Misc*
    token TOP { ^ <prolog> <element> <misc>* $ };

    # [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    token charact {
        [
        | "\r\n"    # Raku treats CR LF as one grapheme, so accept it explicitly
        | <[ \x9 \xA \xD \x20..\xD7FF \xE000..\xFFFD \x10000..\x10FFFF ]>
        ]
    };

    # [3] S ::= (#x20 | #x9 | #xD | #xA)+
    token space {
        [
        | "\r\n"    # CR LF is a single grapheme in Raku
        | \x20      # SP
        | \x9       # TAB
        | \xD       # CR
        | \xA       # LF
        ]+
    };

    # old specification
    # [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
    # token name_character { <+lett+digi+[\.\-_:]> };

    # [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
    #                     | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    #                     | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
    #                     | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
    #                     | [#x10000-#xEFFFF]
    token name_start_character {
        [
        | <[A..Za..z:_]>
        | <[ \xC0..\xD6 \xD8..\xF6 \xF8..\x2FF \x370..\x37D \x37F..\x1FFF ]>
        | <[ \x200C..\x200D \x2070..\x218F \x2C00..\x2FEF \x3001..\xD7FF ]>
        | <[ \xF900..\xFDCF \xFDF0..\xFFFD \x10000..\xEFFFF ]>
        ]
    };

    # [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
    #                 | [#x0300-#x036F] | [#x203F-#x2040]
    token name_character {
        [
        | <name_start_character>
        | <[ 0..9 \- \. ]>
        | <[ \xB7 \x0300..\x036F \x203F..\x2040 ]>
        ]
    };

    # old specification
    # [5] Name ::= (Letter | '_' | ':') (NameChar)*
    # token name { <+lett+[_:]> <name_character>* };

    # [5] Name ::= NameStartChar (NameChar)*
    token name { <name_start_character> <name_character>* };

    # [6] Names ::= Name (#x20 Name)*
    token names { <name> [ \x20 <name> ]* };

    # [7] Nmtoken ::= (NameChar)+
    token nmtoken { <name_character>+ };

    # [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
    # (was declared as a second "nmtoken", which clashed with production [7])
    token nmtokens { <nmtoken> [ \x20 <nmtoken> ]* };

    # [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
    #                   | "'" ([^%&'] | PEReference | Reference)* "'"
    token entity_value {
        | <[']> [ <-['%&]> | <param_entity_reference> | <reference> ]* <[']>
        | '"'   [ <-["%&]> | <param_entity_reference> | <reference> ]* '"'
    };

    # [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
    #                 | "'" ([^<&'] | Reference)* "'"
    # The opening quote is captured in $0; the frugal quantifier stops at the
    # first matching closing quote.
    token attr_value { (<['"]>) [ <-[<&]> | <reference> ]*? $0 };

    # [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    token system_literal {
        | '"'   <-["]>*? '"'
        | <[']> <-[']>*? <[']>
    };

    # [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
    token public_id_literal { (<['"]>) <public_id_character>*? $0 };

    # [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
    token public_id_character {
        | \x20
        | \xD
        | \xA
        | <[a..zA..Z0..9]>
        | <[\-'()+,./:=?;!*#@$_%]>
    };

    # [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    # Matches one or more characters so that it can never succeed on the empty
    # string; every caller in this grammar uses it as <character_data>?.
    token character_data { [ <!before ']]>'> <-[<&]> ]+ };

    # Comments
    # [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
    token comment {
        '<!--'
        [
        | <!before '-'> <charact>
        | '-' <!before '-'> <charact>
        ]*
        '-->'
    };

    # Processing Instructions
    # [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
    # [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
    # The lookahead rejects a target that is exactly "xml" in any letter case,
    # while still allowing longer names such as "xml-stylesheet".
    token processing_instruction {
        '<?'
        <!before <[Xx]> <[Mm]> <[Ll]> <!before <name_character> > >
        <name>
        [ <space> [ <!before '?>'> <charact> ]* ]?
        '?>'
    };

    # CDATA Sections
    # [18] CDSect  ::= CDStart CData CDEnd
    # [19] CDStart ::= '<![CDATA['
    # [20] CData   ::= (Char* - (Char* ']]>' Char*))
    # [21] CDEnd   ::= ']]>'
    token character_data_section {
        '<![CDATA[' [ <!before ']]>'> <charact> ]* ']]>'
    };

    # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
    token prolog { <xml_declaration>? <misc>* [ <doctype_declaration> <misc>* ]? };

    # [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    token xml_declaration {
        '<?xml' <version_info> <encoding_declaration>? <standalone_declaration>? <space>? '?>'
    };

    # [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    token version_info { <space> 'version' <eq> (<["']>) <version_nummer> $0 };

    # [25] Eq ::= S? '=' S?
    token eq { <space>? '=' <space>? };

    # old specification
    # [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
    # [26] VersionNum ::= '1.' [0-9]+
    # ASCII digits only: \d would also accept non-ASCII Unicode digits.
    token version_nummer { '1.' <[0..9]>+ };

    # [27] Misc ::= Comment | PI | S
    token misc { [ <comment> | <processing_instruction> | <space> ] };

    # [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
    #                      ('[' intSubset ']' S?)? '>'      [VC: Root Element Type]
    token doctype_declaration {
        '<!DOCTYPE' <space> <name>
        [ <space> <external_id> ]?
        <space>?
        [ '[' <internal_subset> ']' <space>? ]?
        '>'
    };

    # [28a] DeclSep   ::= PEReference | S      [WFC: PE Between Declarations]
    # [28b] intSubset ::= (markupdecl | DeclSep)*
    token internal_subset { [ <markup_declaration> | <declaration_separator> ]* };
    token declaration_separator { <param_entity_reference> | <space> };

    # [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    token markup_declaration {
        [
        | <element_declaration>
        | <attribute_list_declaration>
        | <entity_declaration>
        | <notation_declaration>
        | <processing_instruction>
        | <comment>
        ]
    };

    # External Subset
    # [30] extSubset     ::= TextDecl? extSubsetDecl
    # [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
    token external_subset { <text_declaration>? <external_subset_declaration> };
    token external_subset_declaration {
        [ <markup_declaration> | <conditional_section> | <declaration_separator> ]*
    };

    # [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
    #                                          [VC: Standalone Document Declaration]
    token standalone_declaration {
        <space> 'standalone' <eq> (<['"]>) [ 'yes' | 'no' ] $0
    };

    # [33] [34] [35] [36] [37] [38] deleted from the official specification

    # [39] element      ::= EmptyElemTag | STag content ETag
    # [40] STag         ::= '<' Name (S Attribute)* S? '>'
    # [42] ETag         ::= '</' Name S? '>'
    # [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'   [WFC: Unique Att Spec]
    # The start-tag name is captured in $0 and must be repeated verbatim in
    # the end tag.
    token element {
        '<' (<name>) [ <space> <attribute> ]* <space>?
        [
        | '/>'                                    # a single (empty-element) tag
        | '>' <content> '</' $0 <space>? '>'      # an opening and a closing tag
        ]
    };

    # [41] Attribute ::= Name Eq AttValue
    token attribute { <name> <eq> <attr_value> };

    # [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    token content {
        <character_data>?
        [
            [
            | <element>
            | <reference>
            | <character_data_section>
            | <processing_instruction>
            | <comment>
            ]
            <character_data>?
        ]*
    };

    # [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
    #                                          [VC: Unique Element Type Declaration]
    # [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
    token element_declaration {
        '<!ELEMENT' <space> <name> <space> <content_spec> <space>? '>'
    };
    token content_spec { 'EMPTY' | 'ANY' | <mixed> | <children> };

    # [47] children ::= (choice | seq) ('?' | '*' | '+')?
    # [48] cp       ::= (Name | choice | seq) ('?' | '*' | '+')?
    # [49] choice   ::= '(' S? cp ( S? '|' S? cp )+ S? ')'   [VC: Proper Group/PE Nesting]
    # [50] seq      ::= '(' S? cp ( S? ',' S? cp )* S? ')'   [VC: Proper Group/PE Nesting]
    token children { [ <choice> | <sequence> ] [ '?' | '*' | '+' ]? };
    token content_particle { [ <name> | <choice> | <sequence> ] [ '?' | '*' | '+' ]? };
    token choice {
        '(' <space>? <content_particle>
        [ <space>? '|' <space>? <content_particle> ]+
        <space>? ')'
    };
    token sequence {
        '(' <space>? <content_particle>
        [ <space>? ',' <space>? <content_particle> ]*
        <space>? ')'
    };

    # [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
    #              | '(' S? '#PCDATA' S? ')'
    #                                          [VC: Proper Group/PE Nesting]
    #                                          [VC: No Duplicate Types]
    token mixed {
        [
        | '(' <space>? '#PCDATA' [ <space>? '|' <space>? <name> ]* <space>? ')*'
        | '(' <space>? '#PCDATA' <space>? ')'
        ]
    };

    # [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
    # [53] AttDef      ::= S Name S AttType S DefaultDecl
    token attribute_list_declaration {
        '<!ATTLIST' <space> <name> <attribute_definition>* <space>? '>'
    };
    token attribute_definition {
        <space> <name> <space> <attribute_type> <space> <default_declaration>
    };

    # [54] AttType       ::= StringType | TokenizedType | EnumeratedType
    # [55] StringType    ::= 'CDATA'
    # [56] TokenizedType ::= 'ID'          [VC: ID]
    #                                      [VC: One ID per Element Type]
    #                                      [VC: ID Attribute Default]
    #                      | 'IDREF'       [VC: IDREF]
    #                      | 'IDREFS'      [VC: IDREF]
    #                      | 'ENTITY'      [VC: Entity Name]
    #                      | 'ENTITIES'    [VC: Entity Name]
    #                      | 'NMTOKEN'     [VC: Name Token]
    #                      | 'NMTOKENS'    [VC: Name Token]
    token attribute_type { <string_type> | <tokenized_type> | <enumerated_type> };
    token string_type { 'CDATA' };
    token tokenized_type {
        [
        | 'ID'
        | 'IDREF'
        | 'IDREFS'
        | 'ENTITY'
        | 'ENTITIES'
        | 'NMTOKEN'
        | 'NMTOKENS'
        ]
    };

    # [57] EnumeratedType ::= NotationType | Enumeration
    # [58] NotationType   ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
    #                                      [VC: Notation Attributes]
    #                                      [VC: One Notation Per Element Type]
    #                                      [VC: No Notation on Empty Element]
    #                                      [VC: No Duplicate Tokens]
    # [59] Enumeration    ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
    #                                      [VC: Enumeration]
    #                                      [VC: No Duplicate Tokens]
    token enumerated_type { <notation_type> | <enumeration> };
    token notation_type {
        'NOTATION' <space> '(' <space>? <name>
        ( <space>? '|' <space>? <name> )*
        <space>? ')'
    };
    token enumeration {
        '(' <space>? <nmtoken>
        [ <space>? '|' <space>? <nmtoken> ]*
        <space>? ')'
    };

    # [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
    #                    | (('#FIXED' S)? AttValue)     [VC: Required Attribute]
    token default_declaration {
        [
        | '#REQUIRED'
        | '#IMPLIED'
        | [ [ '#FIXED' <space> ]? <attr_value> ]
        ]
    };

    # [61] conditionalSect    ::= includeSect | ignoreSect
    # [62] includeSect        ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
    #                                      [VC: Proper Conditional Section/PE Nesting]
    # [63] ignoreSect         ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
    #                                      [VC: Proper Conditional Section/PE Nesting]
    # [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
    # [65] Ignore             ::= Char* - (Char* ('<![' | ']]>') Char*)
    token conditional_section { <include_section> | <ignore_section> };
    token include_section {
        '<![' <space>? 'INCLUDE' <space>? '[' <external_subset_declaration> ']]>'
    };
    # <ignore_section_contents> may match the empty string, so it is called
    # exactly once instead of under '*': quantifying a token that can succeed
    # without consuming input would never make progress.
    token ignore_section {
        '<![' <space>? 'IGNORE' <space>? '[' <ignore_section_contents> ']]>'
    };
    token ignore_section_contents {
        <ignore> [ '<![' <ignore_section_contents> ']]>' <ignore> ]*
    };
    token ignore { [ <!before '<![' | ']]>' > <charact> ]* };

    # [67] Reference ::= EntityRef | CharRef
    token reference { <entity_reference> | <char_reference> };

    # [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
    token char_reference {
        '&'
        [
        | '#'  <[0..9]>+
        | '#x' <[ 0..9 a..f A..F ]>+
        ]
        ';'
    };

    # [68] EntityRef ::= '&' Name ';'
    token entity_reference { '&' <name> ';' };

    # [69] PEReference ::= '%' Name ';'
    token param_entity_reference { '%' <name> ';' };

    # [70] EntityDecl ::= GEDecl | PEDecl
    token entity_declaration {
        [ <general_entity_declaration> | <param_entity_declaration> ]
    };

    # [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
    token general_entity_declaration {
        '<!ENTITY' <space> <name> <space> <entity_definition> <space>? '>'
    };

    # [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
    token entity_definition {
        [ <entity_value> | [ <external_id> <ndata_declaration>? ] ]
    };

    # [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
    token param_entity_declaration {
        '<!ENTITY' <space> '%' <space> <name> <space> <param_entity_definition> <space>? '>'
    };

    # [74] PEDef ::= EntityValue | ExternalID
    token param_entity_definition { [ <entity_value> | <external_id> ] };

    # [75] ExternalID ::= 'SYSTEM' S SystemLiteral
    #                   | 'PUBLIC' S PubidLiteral S SystemLiteral
    token external_id {
        [
        | 'SYSTEM' <space> <system_literal>
        | 'PUBLIC' <space> <public_id_literal> <space> <system_literal>
        ]
    };

    # [76] NDataDecl ::= S 'NDATA' S Name
    token ndata_declaration { <space> 'NDATA' <space> <name> };

    # [78] extParsedEnt ::= TextDecl? content
    token extern_parse_entity { <text_declaration>? <content> };

    # [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
    token text_declaration {
        '<?xml' <version_info>? <encoding_declaration> <space>? '?>'
    };

    # [79] deleted from specification

    # [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
    token encoding_declaration { <space> 'encoding' <eq> (<['"]>) <encoding_name> $0 };

    # [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
    token encoding_name { <[A..Z a..z]> <[A..Z a..z 0..9 ._\- ]>* };

    # Notation Declarations
    # [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
    # [83] PublicID     ::= 'PUBLIC' S PubidLiteral
    token notation_declaration {
        '<!NOTATION' <space> <name> <space>
        [ <external_id> | 'PUBLIC' <space> <public_id_literal> ]
        <space>? '>'
    };

    # Productions of the old specification, kept for reference only:
    # [84] Letter ::= BaseChar | Ideographic
    # token lett { <[a..zA..Z]> };   #TODO Unicode letter predefined?
    # [88] Digit ::= [#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | ...
    # token digi { \d };             #TODO Unicode digit predefined?
};