-
Notifications
You must be signed in to change notification settings - Fork 128
/
parseTOML.nix
211 lines (189 loc) · 6.49 KB
/
parseTOML.nix
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
with builtins;
# Tokenizer.
let
# Inter-token layout: one or more spaces/newlines.
layout_pat = "[ \n]+";
# Optional layout: zero or more spaces/newlines (also matches "").
layout_pat_opt = "[ \n]*";
# One TOML token, as an alternation (first alternative wins):
#   =                               assignment sign
#   [[ident.path]]                  array-of-tables header (double brackets)
#   [ident.path]                    table header (single brackets)
#   [ ...anything-but-']'... ]      other bracketed runs (inline arrays, kept raw)
#   bare-word                       keys / true / false / bare values
#   "..."                           basic string (no escape handling — see unescapeString)
token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[[][^]]+[]]|[a-zA-Z0-9_-]+|"[^"]*"''; #"
# Lexer for Nix < 1.12, where builtins.split does not exist. Works by
# repeatedly anchoring builtins.match at the head of the remaining input.
tokenizer_1_11 = str:
let
# len:        total length of the original string (for substring).
# prevTokens: tokens accumulated so far.
# patterns:   list of patterns, widest (most tokens per match) first.
# str:        remaining unconsumed input.
tokenizer_rec = len: prevTokens: patterns: str:
let
pattern = head patterns;
layoutAndTokens = match pattern str;
# First capture group is the consumed prefix (layout + tokens);
# its length tells how far to advance.
matchLength = stringLength (head layoutAndTokens);
tokens = prevTokens ++ tail layoutAndTokens;
in
# Laziness matters here: layoutAndTokens is only forced by this
# null test, so the head/tail above are safe.
if layoutAndTokens == null then
# if we cannot reduce the pattern, return the list of token
if tail patterns == [] then prevTokens
# otherwise, take the next pattern, which only captures half the token.
else tokenizer_rec len prevTokens (tail patterns) str
else tokenizer_rec len tokens patterns (substring matchLength len str);
# Heuristic used to size the widest pattern: assume ~100 chars per token.
avgTokenSize = 100;
# Smallest n such that 2^n >= v (with a floor of 1).
ceilLog2 = v:
let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
inner 1 1;
# The builtins.match function match the entire string, and generate a list of all captured
# elements. This is the most efficient way to make a tokenizer, if we can make a pattern which
# capture all token of the file. Unfortunately C++ std::regex does not support captures in
# repeated patterns. As a work-around, we generate patterns which are matching tokens in multiple
# of 2, such that we can avoid iterating too many times over the content.
generatePatterns = str:
let
depth = ceilLog2 (stringLength str / avgTokenSize);
# inner d builds patterns capturing 2^d, 2^(d-1), ..., 1 tokens.
inner = depth:
if depth == 0 then [ "(${token_pat})" ]
else
let next = inner (depth - 1); in
[ "${head next}${layout_pat}${head next}" ] ++ next;
in
# Wrap each pattern so group 1 is the consumed prefix and ".*"
# swallows the rest of the file (match must cover the whole string).
map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);
in
tokenizer_rec (stringLength str) [] (generatePatterns str) str;
# Lexer for Nix >= 1.12, built on builtins.split.
tokenizer_1_12 = str:
  let
    # builtins.split alternates unmatched text (plain strings) with a
    # singleton capture list for every "(token)" match.
    pieces = split "(${token_pat})" str;
    # Text between tokens must be pure layout; anything else is a lexing error.
    keepPiece = piece:
      if isString piece then
        if match layout_pat_opt piece != null then false
        else throw "Error: Unexpected token: '${piece}'"
      else true;
    # Each surviving piece is a one-element capture list; unwrap it.
    unwrap = captures: assert tail captures == []; head captures;
  in
  map unwrap (filter keepPiece pieces);
# Dispatch on interpreter capability: builtins.split appeared in Nix
# 1.12 and enables the much faster lexer.
tokenizer =
  if ! (builtins ? split)
  then tokenizer_1_11
  else tokenizer_1_12;
in
# Parse entry headers
let
# Strip the surrounding double quotes from a basic-string token.
# Escape sequences inside the quotes are deliberately left untouched.
unescapeString = str:
  assert match ''"[^"]*"'' str != null; #"
  let innerLen = stringLength str - 2; in
  substring 1 innerLen str;
# Match one component of a TOML section name: either a bare identifier
# or a double-quoted basic string.
ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"''; #"
# Drop `wrapLen` bracket characters from each end of `token`,
# e.g. removeBraces "[[a]]" 2 == "a".
removeBraces = token: wrapLen:
  let innerLength = stringLength token - 2 * wrapLen; in
  substring wrapLen innerLength token;
# Note, this implementation is limited to 11 identifiers.
# Split a dotted section path into its components on Nix < 1.12,
# using a single nested-capture regex instead of builtins.split.
matchPathFun_1_11 = token:
let
# match header_pat "a.b.c" == [ "a" ".b" "b" ".c" "c" ]
# Built by folding: each step nests the previous pattern inside an
# optional "(.<pat>)?" tail, giving 10 levels + the base = 11 idents.
header_pat =
foldl' (pat: n: "(${ident_pat})([.]${pat})?")
"(${ident_pat})" (genList (n: 0) 10);
matchPath = match header_pat token;
# The ".xyz" captures are artifacts of the nesting; keep only the
# identifiers themselves. NOTE: if match returned null (too many
# components or bad syntax) this filter throws rather than
# returning null.
filterDot = filter (s: substring 0 1 s != ".") matchPath;
in
filterDot;
# Split a dotted section path on Nix >= 1.12: captured identifiers
# arrive as singleton lists, separators (the dots) as plain strings.
matchPathFun_1_12 = token:
  map head (filter isList (split "(${ident_pat})" token));
# Pick the path-splitting implementation matching the interpreter.
matchPathFun =
  if ! (builtins ? split)
  then matchPathFun_1_11
  else matchPathFun_1_12;
# Convert a section-header token (e.g. `[a."b.c"]`, wrapLen 1, or
# `[[x.y]]`, wrapLen 2) into a list of path components, unquoting any
# string-style identifiers.
headerToPath = token: wrapLen:
  let
    inner = removeBraces token wrapLen;
    components = matchPathFun inner;
    unquote = s:
      if substring 0 1 s == ''"'' then unescapeString s #"
      else s;
    path = map unquote components;
  in
  assert components != null;
  # assert trace "Path: ${inner}; match as ${toString path}" true;
  path;
in
# Reconstruct the equivalent attribute set.
let
# Interpret a raw value token: booleans become Nix booleans, inline
# arrays stay as raw bracketed text for now (TODO: convert the TOML
# list into a Nix list), everything else is a quoted string.
tokenToValue = token:
  if token == "false" then false
  else if token == "true" then true
  else if match "[[][^]]+[]]" token == null then unescapeString token
  else token;
# Initial parser state. `idx` tracks progress through a `key = value`
# triple (0 = expect key or header, 1 = expect '=', 2 = expect value);
# `path`/`isList` describe the current section header; `elem`
# accumulates the current section's bindings; `output` collects
# finished sections.
parserInitState = {
  elem = {};
  output = [];
  isList = false;
  path = [];
  idx = 0;
};
# Imported from nixpkgs library.
# Nest `value` under the attribute path, one level per component:
# setAttrByPath [ "a" "b" ] v == { a = { b = v; }; }.
setAttrByPath = attrPath: value:
  if attrPath == [] then value
  else
    let
      name = head attrPath;
      rest = tail attrPath;
    in
    listToAttrs [ { inherit name; value = setAttrByPath rest value; } ];
# Flush the section accumulated in `state.elem` into `state.output`,
# nested under the current header path. Array-of-tables sections are
# wrapped in a singleton list so that zipAttrs can later concatenate
# repeated headers into one list.
closeSection = state:
  let
    sectionValue = if state.isList then [ state.elem ] else state.elem;
    nested = setAttrByPath state.path sectionValue;
  in
  state // { output = state.output ++ [ nested ]; };
# One step of the parser state machine: consume a single token and
# return the updated state. See parserInitState for the meaning of idx.
readToken = state: token:
# assert trace "Read '${token}'" true;
if state.idx == 0 then
# Expecting a key or a section header.
if substring 0 2 token == "[[" then
# Array-of-tables header: close the current section and open a
# new one marked as a list element.
(closeSection state) // {
path = headerToPath token 2;
isList = true;
elem = {};
}
else if substring 0 1 token == "[" then
# Plain table header: close the current section and open a new one.
(closeSection state) // {
path = headerToPath token 1;
isList = false;
elem = {};
}
else
# Bare key: remember its name and expect '=' next.
assert match "[a-zA-Z0-9_-]+" token != null;
state // { idx = 1; name = token; }
else if state.idx == 1 then
# Between key and value: only '=' is legal here.
assert token == "=";
state // { idx = 2; }
else
# Expecting a value: bind it under the remembered key and reset.
assert state.idx == 2;
state // {
idx = 0;
elem = state.elem // {
"${state.name}" = tokenToValue token;
};
};
# Aggregate each section as individual attribute sets.
parser = str:
  let finalState = foldl' readToken parserInitState (tokenizer str); in
  # The last section is still pending in `elem`; flush it into output.
  closeSection finalState;
# Parse a TOML string into a Nix attribute set by merging all the
# per-section attribute sets produced by the parser.
fromTOML = toml:
let
sections = (parser toml).output;
# Inlined from nixpkgs library functions.
# Deep-merge a list of attribute sets: scalars must be unique,
# lists (from [[array-of-tables]] headers) are concatenated, and
# nested sets are merged recursively.
zipAttrs = sets:
listToAttrs (map (n: {
name = n;
value =
let v = catAttrs n sets; in
# assert trace "Visiting ${n}" true;
if tail v == [] then head v
else if isList (head v) then concatLists v
else if isAttrs (head v) then zipAttrs v
else throw "cannot merge sections";
# Duplicate names from the concatLists below are harmless:
# listToAttrs keeps the first binding for a given name.
}) (concatLists (map attrNames sets)));
in
zipAttrs sections;
in
{
# Self-test: parse a local copy of the Rust nightly channel manifest.
testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
# Self-test against the live upstream manifest. NOTE(review): this
# fetches over the network at evaluation time — only force this
# attribute when that is acceptable.
testing_url = fromTOML (builtins.readFile (builtins.fetchurl
"https://static.rust-lang.org/dist/channel-rust-nightly.toml"));
inherit fromTOML;
}