Add KDL2 grammar in KDL

This commit is contained in:
eugenesvk 2025-01-12 16:15:56 +07:00
parent 39a098237d
commit b046014cf6
2 changed files with 120 additions and 1 deletions

View File

@ -837,7 +837,8 @@ except for non Unicode Scalar Value, which can't be represented even as escapes)
This is the full official grammar for KDL and should be considered
authoritative if something seems to disagree with the text above. The [grammar
language syntax](#grammar-language) is defined below.
language syntax](#grammar-language) is defined below. For a version converted to KDL,
see [examples/kdl-grammar2.kdl](./examples/kdl-grammar2.kdl).
```
document := bom? version? nodes

118
examples/kdl-grammar2.kdl Normal file
View File

@ -0,0 +1,118 @@
/-kdl-version 2
§ "KDL Grammar" spec=(🔗)"github.com/kdl-org/kdl/blob/main/SPEC.md" !="See symbol defs at the bottom"\
document = bom? version? nodes \
version =≝ /- u␠* "kdl-version" u␠+ ⦅ "1" | "2" ⦆ u␠* ␤ \
nodes = ⦅l_␠* node⦆* l_␠* \
node = node_base node_end \
node_chlast = node_base node_end? \
node_base =≝ /-? 🄣? n␠* string \
⦅n␠+ /-? node_proparg ⦆* \
⦅n␠+ /- node_children ⦆* !="Slashdashed node_children must be after props and args" \
⦅n␠+ node_children ⦆? !="Only children may follow a slashdashed child" \
⦅n␠+ /- node_children ⦆* \
n␠*
§ "Entries" \
node_proparg = prop | argument \
prop = string n␠* "=" n␠* 🄣? n␠* value \
argument = 🄣? n␠* value \
🄣 = "(" n␠* string n␠* ")" \
value = string | number | boolean | #null \
node_children = "{" nodes node_chlast? "}" \
node_end = comment_line | ␤ | ";" | eof
§ "Strings" \
string = str🆔 | string'' | string_raw ¶ \
str🆔 = clean🆔 | signed🆔 | dotted🆔 \
clean🆔 = ⦅⦅char🆔 09 ± .⦆ char🆔*⦆ keyword🆔🛑 \
signed🆔 = ± ⦅⦅char🆔 09 .⦆ char🆔*⦆? \
dotted🆔 = ±? . ⦅⦅char🆔 09 ⦆ char🆔*⦆? \
char🆔 = uchar u␠ char🆔¬ char🛑 \
char🆔¬ = #"(;){"}[#]/=\"# \
keyword🆔🛑 = true|false|null|inf|-inf|nan \
string'' = “ ⦅ char_str ␤⦆* ” \
|= “““ \
⦅␤ de␠e≡ ⦅“?“? char_str ␤ - u␠⦆ ⦅“?“? char_str ␤⦆* ⦆* \
⦅␤ ⦅ u␠⦆* ⦆* \
␤ de␠e* ””” !="de␠e≡ must exactly match de␠e; only required before non-whitespace" \
char_str = char⎋ | uni⎋ | w␠⎋ | 〔¬⧵“〕 char🛑 \
char⎋ = bfnrts \
uni⎋ = "u{" uchar₁₆ "}" \
uchar₁₆ = 010FFFF (surrogate)D800DFFF !="ranges in \u{F} format, but represented as hex₁₆, not actual symbols, so \u{B0} is 'B0', not '°', can be 0-padded up to max len of 6"\
w␠⎋ = ⦅u␠|␤⦆+ \
de␠e = u␠ | w␠⎋ \
string_raw = * string_raw_quoted *ℕ≡ !="=1∞ ℕ≡ must match " \
string_raw_quoted = “ string_raw_body_line ” \
|= “““ ␤ string_raw_body☰line ␤ de␠u* ””” \
string_raw_body_line = "" \
|= ⦅char_raw ”⦆ char_raw*? \
|= “ ⦅char_raw ”⦆ char_raw*? \
char_raw = uchar char🛑 \
string_raw_body☰line = de␠u≡ ⦅uchar char🛑⦆*? !="←de␠u must exactly match ≡↑" \
de␠u = u␠
§ "Numbers (space in is ignored)" \
number = float_keyword | hex | octal | binary | decimal \
hex = ±?0x 🔢₁₆ ⦅_ | 🔢₁₆⦆* \
octal = ±?0o 07 07_* \
binary = ±?0b 01 01_* \
decimal = ±? integer ⦅. integer⦆? ? \
integer = 09 09_* \
= eE ±? integer \
\
± = +- \
🔢₁₆ = 09afAF
§ "Keywords and booleans" \
keyword = boolean | #null \
float_keyword = #inf | #-inf | #nan \
boolean = #true | #false
§ "Specific code points" !="\u{0} unicode format is implied: u0=\u{0} AF=\u{A}\u{F} (range)" \
bom = "\u{FEFF}" \
char🛑 = (¬1st)"bom" (c0a)u0u8 (c0b)E1F (cdel)7F (c_dir)200E200F202A202E20662069 usv¬ (🔗spec)"disallowed-literal-code-points" \
usv¬ =(surrogate)≝ (high)D800DBFF (low)DC00DFFF !="valid c0: u9uD 9␉\t A␊\n B␋\\v C␌\f D␍\r" \
uchar = 0D7FFE00010FFFF !="Any Unicode Scalar Value (no surrogates)" \
u␠ = 920A016802000200120022003200420052006200720082009200A202F205F3000 !="Non line-breaking unicode White_Space" (🔗spec)"whitespace"\
"       "\
u␤ = ABCD8520282029 !="(85NEL LS PS) Line-breaking unicode White_Space" (🔗spec)"newline" \
“="\"" ”="\"" ="\\" =";" ="/" *="" ="#"
§ "Comments (// /* */ after = serve as definition symbols)" \
/- ≝ /- l_␠* \
comment_block =≝ /* commented_block
commented_block = */ \
|= ⦅comment_block | * | | 〔¬*+⦆ 🗘 \
comment_line =≝ // ¬␤* ⦅␤ | eof⦆
§ "Whitespace" !="\u{F} unicode format is implied in F and omitted for brevity" \
l_␠ = n␠ | ␤ | comment_line !="Whitespace where newlines are allowed" \
n␠ = cu␠* escline cu␠* | cu␠+ !="Whitespace within nodes, where newline-ish things must be esclined" \
cu␠ = u␠ | comment_block \
escline = cu␠* ⦅comment_line | ␤ | eof⦆ \
␤ = DA | u␤ !="DA \r\n ␍␊ is a sequence, but counts as 1 newline" (🔗spec)"newline"
§ "Grammar language help: ABNF-like with some regex in a " rule="definition" "format" {
• #"\ "“ "” ; / * #"#="literals represented with unicode alts to avoid quotes"
• #"\ " ; = "#="regular KDL syntax, not grammar"\
#" "x" "#="double quotes for literals" ##" #"x"# "##="raw+double quotes for literals"\
∅=(empty)"" #"\"#=(text)"escape \" \\ or add uchar in hex \u{FEFF}"\
keys=(and_types)"extra group info" ≝="filler to allow /-comments in prop values"
• |="logical OR" !="tip/comment" §="section marker" 🗘="refers to self (rule name left of =)"
Regexmatches \
?="0|1 zero or one" \
+="1+ one or more ( greedy)" \
*="0+ zero or more ( greedy, match as many instances as possible)" \
*?="0+ zero or more (non-greedy, match as few instances as possible; used only in raw strings)" \
*16=" inclusive range: at least 1 at most 6" \
"09"="char range (inclusive)" ="en-dash used to not blend with hyphen-minus in search" \
¬="not this (^ in regex)" ¬foo="must not match 'foo'" 〔¬⧵“〕=#"[^\"] any char except for \ or " "# \
set="[a b]" #"set"#="[ab]" ≝="regex Char set matches like [], where any char will be a single match"\
"↑ ignore ↑ contained spaces (for visual separation of groups and alignment)"
⦅group⦆ "items must be matched together"\
a|b="'a or b', whichever matches 1st. Multiple items before | are a single group" "a b c | d"="⦅a b c⦆ | d"\
≡="defined to be identical to another similarly named match"
="minus sign; 'except for' whatever follows it" "uchar␤"="match uchar rule, but not ‘␤’ rule"
• ¶="cut point: always matches and consumes no chars, but once matched, bans backtracking past that point in the source. If a parser would rewind past ¶, it must instead fail the overall parse, as if it had run out of options (only for string_raw to ensure the 1st instance of the appropriate closing quote sequence ends it)"
• "A single definition may be split over multiple lines. Newlines are treated as spaces"
}