
# try to follow the specification
# Extensible Markup Language (XML) 1.0 (Fifth Edition)
# W3C Recommendation 26 November 2008
# http://www.w3.org/TR/2008/REC-xml-20081126/
# 
# It's "only" an XML grammar:
# most of the Well-formedness constraints and validation constraints are not implemented
#
# Rakudo limitation:
# - UTF-8: \x hexadecimal notation now works in character classes, but not for values > \x100
# - overloading built-in rules such as <ws> does not work correctly yet,
#   so only "token" (with an explicit <space>) is used, never "rule"
#
# Brain limitation:
# - Read S05
# - learn more about backtracking
# - learn more about XML
# 
# Question - What is the elegant way of doing such things?
#  CharData     ::=    [^<&]* - ([^<&]* ']]>' [^<&]*)
#  PI   ::=    '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

# use v6;
grammar XML::Grammar {

# Grammar for the productions of "Extensible Markup Language (XML) 1.0
# (Fifth Edition)".  Each token is preceded by the numbered production it
# implements.  Only the grammar is covered; apart from the start/end tag name
# match in `element`, well-formedness and validity constraints are not checked.
#
# Every production is a `token` (no backtracking, no implicit whitespace);
# white space is always spelled out with <space>.  "X minus a delimiter"
# productions are written as `[ <!before DELIM> CHAR ]*`.
#
# The character classes below use the full code point ranges of the
# specification, so a Rakudo that accepts \x[...] escapes above \x[100]
# inside character classes is required.

# [1]   document   ::=   prolog element Misc*
    token TOP   { ^ <prolog> <root_element=element> <misc>* $ };

# [2]   Char   ::=   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# NOTE(review): Rakudo stores CR LF as a single grapheme; confirm that this
# class (and `space` below) match it if CRLF input must be supported.
    token charact {
        <[ \x[9] \x[A] \x[D]
           \x[20]..\x[D7FF] \x[E000]..\x[FFFD] \x[10000]..\x[10FFFF]
        ]>
    };

# [3]   S   ::=   (#x20 | #x9 | #xD | #xA)+
    token space { [ \x20 | \x9 | \xD | \xA ]+ }

# [4]   NameStartChar   ::=   ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
    token name_start_character {
        <[ A..Z a..z : _
           \x[C0]..\x[D6]     \x[D8]..\x[F6]     \x[F8]..\x[2FF]
           \x[370]..\x[37D]   \x[37F]..\x[1FFF]  \x[200C]..\x[200D]
           \x[2070]..\x[218F] \x[2C00]..\x[2FEF] \x[3001]..\x[D7FF]
           \x[F900]..\x[FDCF] \x[FDF0]..\x[FFFD] \x[10000]..\x[EFFFF]
        ]>
    };

# [4a]  NameChar   ::=   NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
    token name_character {
        [
          | <name_start_character>
          | <[ 0..9 \- \. \x[B7] \x[300]..\x[36F] \x[203F]..\x[2040] ]>
        ]
    };

# [5]   Name   ::=   NameStartChar (NameChar)*
    token name { <name_start_character> <name_character>* };

# [6]   Names   ::=   Name (#x20 Name)*
    token names { <name> [ \x20 <name> ]* };

# [7]   Nmtoken   ::=   (NameChar)+
    token nmtoken { <name_character>+ };

# [8]   Nmtokens   ::=   Nmtoken (#x20 Nmtoken)*
# This used to be declared a second time under the name `nmtoken`, which both
# clashed with production [7] and called itself without consuming input.
    token nmtokens { <nmtoken> [ \x20 <nmtoken> ]* };

# [9]   EntityValue   ::=   '"' ([^%&"] | PEReference | Reference)* '"'
#                         |  "'" ([^%&'] | PEReference | Reference)* "'"
    token entity_value {
          <[']> [ <-['%&]> | <param_entity_reference> | <reference> ]* <[']>
        | '"'   [ <-["%&]> | <param_entity_reference> | <reference> ]* '"'
    };

# [10]  AttValue   ::=   '"' ([^<&"] | Reference)* '"'
#                      |  "'" ([^<&'] | Reference)* "'"
# The opening quote is captured in $0; the frugal quantifier stops at the
# first occurrence of that same quote.
    token attr_value { (<['"]>) [ <-[<&]> | <reference> ]*? $0 };

# [11]  SystemLiteral   ::=   ('"' [^"]* '"') | ("'" [^']* "'")
    token system_literal { '"' <-["]>*? '"' | <[']> <-[']>*? <[']> };

# [12]  PubidLiteral   ::=   '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
    token public_id_literal { (<['"]>) <public_id_character>*? $0 };

# [13]  PubidChar   ::=   #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
    token public_id_character { \x20| \xD | \xA | <[a..zA..Z0..9]> | <[\-'()+,./:=?;!*#@$_%]> };

# [14]  CharData   ::=   [^<&]* - ([^<&]* ']]>' [^<&]*)
# Matches a non-empty run so that callers may use it with `?` or inside a
# loop; the lookahead implements the "minus ']]>'" part.  '>' on its own is
# legal character data.
    token character_data { [ <!before ']]>'> <-[<&]> ]+ };

# Comments
# [15]  Comment   ::=   '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
# The body is any text without "--"; it is not restricted to <charact>.
    token comment { '<!--' [ <!before '--'> . ]* '-->' };

# Processing Instructions
# [16]  PI   ::=   '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
# [17]  PITarget   ::=   Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
# Only the exact name "xml" (any case) is refused as a target; names that
# merely start with it, such as "xml-stylesheet", are accepted.
    token processing_instruction {
        '<?' [ <!before [:i 'xml'] <!name_character> > <name> ]
        [ <space> <charact>*? ]?
        '?>'
    };

# CDATA Sections
# [18]  CDSect   ::=   CDStart CData CDEnd
# [19]  CDStart   ::=   '<![CDATA['
# [20]  CData   ::=   (Char* - (Char* ']]>' Char*))
# [21]  CDEnd   ::=   ']]>'
# The body is any text up to the first "]]>"; it is not restricted to <charact>.
    token character_data_section { '<![CDATA[' [ <!before ']]>'> . ]* ']]>' };

# [22]  prolog   ::=   XMLDecl? Misc* (doctypedecl Misc*)?
    token prolog { <xml_declaration>? <misc>* [ <doctype_declaration> <misc>* ]? };

# [23]  XMLDecl   ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    token xml_declaration { '<?xml' <version_info> <encoding_declaration>? <standalone_declaration>? <space>? '?>' };

# [24]  VersionInfo   ::=   S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    token version_info {  <space> 'version' <eq> (<["']>) <version_nummer> $0 };

# [25]  Eq   ::=   S? '=' S?
    token eq { <space>? '=' <space>? };

# [26]  VersionNum   ::=   '1.' [0-9]+
# ASCII digits only; \d would also accept other Unicode decimal digits.
    token version_nummer { '1.' <[0..9]>+ };

# [27]  Misc   ::=   Comment | PI | S
    token misc { [ <comment> | <processing_instruction> | <space> ] };

# [28]  doctypedecl   ::=   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'   [VC: Root Element Type]
    token doctype_declaration {  '<!DOCTYPE' <space> <name> [ <space> <external_id> ]? <space>? [ '[' <internal_subset> ']' <space>? ]? '>' };

# [28a] DeclSep   ::=   PEReference | S   [WFC: PE Between Declarations]
# [28b] intSubset   ::=   (markupdecl | DeclSep)*
    token internal_subset { [ <markup_declaration> | <declaration_separator> ]* };
    token declaration_separator { <param_entity_reference> | <space> };

# [29]  markupdecl   ::=   elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    token markup_declaration { [
                                   | <element_declaration>
                                   | <attribute_list_declaration>
                                   | <comment>
                                   | <processing_instruction>
                                   | <entity_declaration>
                                   | <notation_declaration>
                                 ]
    };

# External Subset
# [30]  extSubset   ::=   TextDecl? extSubsetDecl
# [31]  extSubsetDecl   ::=   ( markupdecl | conditionalSect | DeclSep)*
    token external_subset { <text_declaration>? <external_subset_declaration> };
    token external_subset_declaration { [ <markup_declaration> | <conditional_section> | <declaration_separator> ]* };

# [32]  SDDecl   ::=   S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))   [VC: Standalone Document Declaration]
    token standalone_declaration { <space> 'standalone' <eq> (<['"]>) [yes|no] $0 };

# [33] .. [38] were removed from the specification

# [39]  element   ::=   EmptyElemTag | STag content ETag
# [40]  STag   ::=   '<' Name (S Attribute)* S? '>'
# [42]  ETag   ::=   '</' Name S? '>'
# [44]  EmptyElemTag   ::=   '<' Name (S Attribute)* S? '/>'   [WFC: Unique Att Spec]
# The tag name is captured in $0 so that the end tag must repeat it.
    token element   {
           '<' (<name>) [ <space> <attribute> ]* <space>?
          [
               | '/>'                                # empty-element tag
               | '>' <content> '</' $0 <space>? '>'  # start tag, content, end tag
          ]
    };

# [41]  Attribute   ::=   Name Eq AttValue
    token attribute { <name> <eq> <attr_value> };

# [43]  content   ::=   CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    token content {
         <character_data>?
         [
            [
               | <comment>
               | <processing_instruction>
               | <element>
               | <reference>
               | <character_data_section>
            ]    <character_data>?
         ]*
    };

# [45]  elementdecl   ::=   '<!ELEMENT' S Name S contentspec S? '>'   [VC: Unique Element Type Declaration]
# [46]  contentspec   ::=   'EMPTY' | 'ANY' | Mixed | children
    token element_declaration { '<!ELEMENT' <space> <name> <space> <content_spec> <space>? '>' };
    token content_spec { 'EMPTY' | 'ANY' | <mixed> | <children> };

# [47]  children   ::=   (choice | seq) ('?' | '*' | '+')?
# [48]  cp   ::=   (Name | choice | seq) ('?' | '*' | '+')?
# [49]  choice   ::=   '(' S? cp ( S? '|' S? cp )+ S? ')'   [VC: Proper Group/PE Nesting]
# [50]  seq   ::=   '(' S? cp ( S? ',' S? cp )* S? ')'   [VC: Proper Group/PE Nesting]
    token children { [ <choice> | <sequence> ] [ '?' | '*' | '+' ]? };
    token content_particle { [ <name> | <choice> | <sequence> ] [ '?' | '*' | '+' ]? };
    token choice { '(' <space>? <content_particle> [ <space>? '|' <space>? <content_particle> ]+ <space>? ')' };
    token sequence { '(' <space>? <content_particle> [ <space>? ',' <space>? <content_particle> ]* <space>? ')' };

# [51]  Mixed   ::=   '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')'   [VC: Proper Group/PE Nesting]
#                                                                                           [VC: No Duplicate Types]
    token mixed { [
                    | '(' <space>? '#PCDATA' [<space>? '|' <space>? <name>]* <space>? ')*'
                    | '(' <space>? '#PCDATA' <space>? ')'
                  ]
   };

# [52]  AttlistDecl   ::=   '<!ATTLIST' S Name AttDef* S? '>'
# [53]  AttDef   ::=   S Name S AttType S DefaultDecl
    token attribute_list_declaration { '<!ATTLIST' <space>  <name> <attribute_definition>* <space>? '>' };
    token attribute_definition { <space> <name> <space> <attribute_type> <space>  <default_declaration> };

# [54]  AttType   ::=   StringType | TokenizedType | EnumeratedType
# [55]  StringType   ::=   'CDATA'
# [56]  TokenizedType   ::=   'ID'         [VC: ID]
#                                          [VC: One ID per Element Type]
#                                          [VC: ID Attribute Default]
#                           | 'IDREF'      [VC: IDREF]
#                           | 'IDREFS'     [VC: IDREF]
#                           | 'ENTITY'     [VC: Entity Name]
#                           | 'ENTITIES'   [VC: Entity Name]
#                           | 'NMTOKEN'    [VC: Name Token]
#                           | 'NMTOKENS'   [VC: Name Token]
    token attribute_type { <string_type> | <tokenized_type> | <enumerated_type> };
    token string_type { 'CDATA' };
    token tokenized_type { [
                             | 'ID'
                             | 'IDREF'
                             | 'IDREFS'
                             | 'ENTITY'
                             | 'ENTITIES'
                             | 'NMTOKEN'
                             | 'NMTOKENS'
                           ]
    };

# [57]  EnumeratedType   ::=   NotationType | Enumeration
# [58]  NotationType   ::=   'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'   [VC: Notation Attributes]
#                                                                               [VC: One Notation Per Element Type]
#                                                                               [VC: No Notation on Empty Element]
#                                                                               [VC: No Duplicate Tokens]
# [59]  Enumeration   ::=   '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'   [VC: Enumeration]
#                                                                       [VC: No Duplicate Tokens]
    token enumerated_type { <notation_type> | <enumeration> };
    token notation_type {  'NOTATION' <space> '(' <space>? <name> (<space>? '|' <space>? <name>)* <space>? ')' };
    token enumeration  { '(' <space>? <nmtoken> [ <space>? '|' <space>? <nmtoken> ]* <space>? ')' };

# [60]  DefaultDecl   ::=   '#REQUIRED' | '#IMPLIED'
#                         | (('#FIXED' S)? AttValue)   [VC: Required Attribute]
    token default_declaration { [
                                  | '#REQUIRED'
                                  | '#IMPLIED'
                                  | [[ '#FIXED' <space> ]? <attr_value> ]
                                ]
    };

# [61]  conditionalSect   ::=   includeSect | ignoreSect
# [62]  includeSect   ::=   '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'   [VC: Proper Conditional Section/PE Nesting]
# [63]  ignoreSect   ::=   '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'   [VC: Proper Conditional Section/PE Nesting]
# [64]  ignoreSectContents   ::=   Ignore ('<![' ignoreSectContents ']]>' Ignore)*
# [65]  Ignore   ::=   Char* - (Char* ('<![' | ']]>') Char*)
# ignoreSectContents can match the empty string and already consumes
# everything up to the closing "]]>", so it is called once rather than under
# `*`; the accepted language is the same.
# `ignore` takes any text up to the next "<![" or "]]>"; it is not restricted
# to <charact>.
    token conditional_section { <include_section> | <ignore_section> };
    token include_section { '<![' <space>? 'INCLUDE' <space>? '[' <external_subset_declaration> ']]>' };
    token ignore_section {  '<![' <space>? 'IGNORE' <space>? '[' <ignore_section_contents> ']]>' };
    token ignore_section_contents {  <ignore> [ '<![' <ignore_section_contents> ']]>' <ignore>]* };
    token ignore { [ <!before '<![' | ']]>'> . ]* };

# [67]  Reference   ::=   EntityRef | CharRef
    token reference { <char_reference> | <entity_reference> };

# [66]  CharRef   ::=   '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
# ASCII digits only; \d would also accept other Unicode decimal digits.
    token char_reference {
            '&' [
                    | '#' <[0..9]>+
                    | '#x' <[ 0..9 a..f A..F ]>+
                ] ';'
      };

# [68]  EntityRef   ::=   '&' Name ';'
    token entity_reference { '&' <name> ';' };

# [69]  PEReference   ::=   '%' Name ';'
    token param_entity_reference { '%' <name> ';' };

# [70]  EntityDecl   ::=   GEDecl | PEDecl
    token entity_declaration { [ <general_entity_declaration> | <param_entity_declaration> ] };

# [71]  GEDecl   ::=   '<!ENTITY' S Name S EntityDef S? '>'
    token general_entity_declaration { '<!ENTITY' <space> <name> <space> <entity_definition> <space>? '>' };

# [73]  EntityDef   ::=   EntityValue | (ExternalID NDataDecl?)
    token entity_definition { [ <entity_value> | [ <external_id> <ndata_declaration>? ] ] };

# [72]  PEDecl   ::=   '<!ENTITY' S '%' S Name S PEDef S? '>'
    token param_entity_declaration { '<!ENTITY' <space> '%' <space> <name> <space> <param_entity_definition> <space>? '>' };

# [74]  PEDef   ::=   EntityValue | ExternalID
    token param_entity_definition { [ <entity_value> | <external_id> ] };

# [75]  ExternalID   ::=   'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
    token external_id { ['SYSTEM' <space> <system_literal> | 'PUBLIC' <space> <public_id_literal> <space> <system_literal> ] };

# [76]  NDataDecl   ::=   S 'NDATA' S Name
    token ndata_declaration { <space> 'NDATA' <space> <name> };

# [78]  extParsedEnt   ::=   TextDecl? content
    token extern_parse_entity { <text_declaration>? <content> };

# [77]  TextDecl   ::=   '<?xml' VersionInfo? EncodingDecl S? '?>'
    token text_declaration {  '<?xml' <version_info>? <encoding_declaration>  <space>? '?>' };

# [80]  EncodingDecl   ::=   S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
    token encoding_declaration { <space> 'encoding' <eq> (<['"]>) <encoding_name> $0 };

# [81]  EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
    token encoding_name { <[A..Z a..z]> <[A..Z a..z 0..9 ._\- ]>* };

# Notation Declarations
# [82]  NotationDecl   ::=   '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
# [83]  PublicID   ::=   'PUBLIC' S PubidLiteral
    token notation_declaration { '<!NOTATION' <space> <name> <space> [ <external_id> | 'PUBLIC' <space> <public_id_literal> ] <space>? '>' };

# Productions [84] Letter and [88] Digit belong to earlier editions of the
# specification and are no longer needed by Name / NameChar.

};
1;
