-
Notifications
You must be signed in to change notification settings - Fork 71
/
class_lexer.ahk
389 lines (363 loc) · 14.6 KB
/
class_lexer.ahk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Lexer class
;; Copyright (C) 2012 Adrian Hawryluk
;;
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#include <lineReader>
#include <misc>
#include <ClassCheck>
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LEXER CLASS:
;
; This is a abstract class for a lexer. It is abstract since there are
; required functions that are missing from this class that need to be
; added by the extending (derriving from) this class. You cannot use this
; class on its own and will fail upon attempting on calling one of these
; functions.
;
; The extending class is required to implement the following functions:
;
; 1. tokenizer(lineReader, tokens, gettingLineContinuation)
;
; 2. someTokensAreNotIgnored([byref] tokens)
;
; 3. preparser([byref] tokens)
;
; tokenizer()
;
; The tokenizer() breaks up a line that it retreives from the linereader
; class instance into an array of tokens that it then adds to the token
; array passed to it.
;
; If gettingLineContinuation is non-zero, this tells the function
; that the line is incomplete on its own and that it needs more to be
; completed. Simply forward this value to lineReader.GetLine() like so:
;
; if (lineReader.GetLine(startPos, endPos, gettingLineContinuation))
;
; See lineReaderString class for more details
;
; Successful extraction of any tokens from the lineReader should result
; in the return of a 1. Note that the successful extraction of 0 tokens
; may occur in the case when the line is empty (endPos = startPos-1).
;
; If the lineReader.GetLine() call fails (returns 0) then some end of stream
; token may be added to the end of the tokens array, but a return of 0 from
; the tokenizer() function must occur.
;
; Multiline tokens are possible to lex by determining that the character
; sequence has been found without the end sequence. You can then read in the
; rest of the contents of the token by calling tokenizer_readMultiLineToken()
; or drop its contents by calling tokenizer_dropMultiLineToken().
;
; someTokensAreNotIgnored()
;
; If a call to this function returns 0, then the next line of tokens are
; read in since there is nothing for the parser to do, otherwise the
; generated tokes are released to the parser.
;
; By having tokens parameter as a byref, you can remove all ignored tokens
; immediately by setting it to [] if needed.
;
; preparser()
;
; This preparses the line of tokens prior to calling someTokensAreNotIgnored()
; and prior to setting/appending these new tokens into the current
; token list used by the parser.
;
; If a call to this function returns 0 then another line of tokens are added
; to the current set. Returning 1 states that the line is complete and that
; it can be passed on to the parser.
;
; Although the tokenizer() could find the end of the line, the preparser is
; called once a line has been tokenized() so if you are adding an end of line
; token, it may be advantagious performance wise to add it here rather then
; use another regex in the tokenizer().
;
; By having tokens parameter as a byref, you can remove all ignored tokens
; immediately by setting it to [] if needed.
;
; USAGE:
;
; Using the lexer is fairly simple, you instantiate a lineReader and pass it
; to a new instance of a lexer. You then extract the tokens using the get
; functions and can mark a position to redo a search path later if it is
; discovered that a grammer match has failed and an alternate exists, allowing
; it to process LL(k) grammers.
;
; The lexer does not enforce the type or the structure of each token of the
; tokens list by design allowing the programmer to optimise as they see fit.
; The original design was that of a more traditional lexer where each token
; was given numeric designation (an enumeration), but I realised that this
; could be a perfomance issue in a scripting language, so I was very happy
; when I found a way to seperate token representation from implementation of
; this lexer. However, the enum system is left in case there becomes some
; reason to use it. I personally use a flag system which can be seen in use
; in the c.lexer implmentation.
;
; Using the Enums
;
; The extending class defines the static variables tnames and tenums and
; assigns to them lexer.addTokenNames("lexerToken1", ..., "lexerTokenN")
; and <extendingClassName>.generateEnums() respectivly.
;
; What this does is create an enum list of token names that start with
; END_OF_STREAM(enum value of 0) and INCOMPLETE_TOKEN(1), followed by
; the token names that were specified. To go back from an enum value to
; what the token's name is, use the tenums static varible.
;
; Extracting a Token
;
; Use the get functions. If l is the lexer object, the commands would be
; l.getInc(), l.incGet(), l.getDec(), l.decGet(), l.getPrev(), l.getNext(),
; l.get() and l[x], where x is any integer representing an offset from the
; current token represented by l.t_pos. Manually chaning l.t_pos can be
; done, but a value less than 1 indicates a stale token (token has been
; dropped from the token cache). If the token cache does not have the token
; as pointed at by l.t_pos and it is a positive value, then calling a get
; function will result in an attempt to get that token by reading in as many
; lines as it takes or until it reaches the end of the lineReader stream which
; would cause an error.
;
; The first four get functions will change the l.t_pos pointer. l.getInc()
; and l.getDec() gets the tokens and then moves the l.t_pos pointer.
; l.incGet() and l.decGet() moves the l.t_pos pointer and then gets the token.
;
; The last four get functions do not move the l.t_pos pointer.
;
; Tokens in the token cache can become stale when reading in the next line
; unless a hold function is used or when using l[x] which never drops stale
; tokens. Use l[x] sparingly or you may reduce perfomance and increase
; memory usage.
;
; Hold functions
;
; To be able to backtrack and test another potential grammer match, the hold
; functions can be used to hold off removing tokens when reading in the next
; line. Again, if l is the lexer object, the function calls would be
; l.holdTokens(), l.holdTokensRelease() and l.holdTokensRevert().
;
; l.holdTokens() holds all tokens currently in the cache and remembers
; the current token position so that it can be returned to using
; l.holdTokensRevert(). If a successful gramatic parse occurs use the
; l.holdTokensRelease() function.
;
; For every l.holdTokens() there should be a l.holdTokensRelease() or the
; application report an error due to an invalid gramatic parse. Not
; releasing a hold position could result in reduced performance and an
; increase in memory usage.
;
; Detect end of token stream
;
; The member function hasMoreTokens() ensures that the token cache is updated
; (like the get functions) as well, it will return a 1 if there are more
; tokens are available. It will return 0 if there is not.
;
; END - ENJOY!
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
class lexer extends CheckCallNameExists
{
static tnames := lexer.addTokenNames("INCOMPLETE_TOKEN")
static tenums := lexer.generateEnums()
addTokenNames(tokenNames*)
{
if ((minIndex := (tnames := this.tnames.Clone()).MinIndex()) == "")
tnames := { 0: "END_OF_STREAM" }
else if (minIndex == 1)
tnames.insert(0, "END_OF_STREAM")
tnames.insert(tnames.MaxIndex()+1, tokenNames*)
return tnames
}
generateEnums()
{
tenums := {}
for tenum, tname in this.tnames
tenums[tname] := tenum
return tenums
}
tokens := [] ; token cache
t_pos := 1 ; current index used for tokens. t_pos > t_max will require that the cache be refilled.
; stack indicating tokens in use state that these tokens are not to be removed when caching more tokens.
t_max := 0 ; slight optimisation of tokens.MaxIndex()
; if you parse something that is included from within currentParseFrom, then push currentParseFrom here. When done pop it off and put it in currentParseFrom
defines := [] ; defines
currentParseFrom := 0 ; this is to be a lineReader object
; set to the last parsed token from within the tokenizer().
; This allows seperation between the data representation of the token stream and the lexer.
; All the lexer knows about the representation is that it is an array, but not what each element in the array holds.
incompleteToken := 0
__New(lineReader)
{
this.currentParseFrom := lineReader
}
incGet()
{
if (this.t_pos >= this.t_max)
this.cacheMoreTokens()
return this.tokens[++this.t_pos]
}
getInc()
{
if (this.t_pos > this.t_max)
this.cacheMoreTokens()
return this.tokens[this.t_pos++]
}
decGet()
{
if (this.t_pos <= 1)
FAIL("Trying to retreive stale token in cache store.")
return this.tokens[--this.t_pos]
}
getDec()
{
if (this.t_pos <= 1)
FAIL("Trying to move token pointer to stale cache location.")
return this.t_pos--
}
__Get(offset)
{
getAtIndex := this.t_pos
if (1 <= getAtIndex)
{
while (getAtIndex > this.t_max)
{
this.cacheMoreTokens(1, 1) ; make sure to hold on to currently cached tokens or current t_pos will become wrong
}
return this.tokens[getAtIndex]
}
FAIL("Trying to retreive stale token in cache store.")
}
get()
{
if (this.t_pos > this.t_max)
this.cacheMoreTokens()
return this.tokens[this.t_pos]
}
getPrev()
{
if (this.t_pos <= 1)
FAIL("Trying to retreive stale token in cache store.")
return this.tokens[this.t_pos-1]
}
getNext()
{
if (this.t_pos >= this.t_max)
this.cacheMoreTokens(1, 1) ; make sure to hold on to currently cached tokens or current t_pos will become wrong
return this.tokens[this.t_pos+1]
}
hasMoreTokens()
{
result := 1
; loop while the token at t_pos is not available and there are more tokens available
while this.t_pos > this.t_max and result := this.cacheMoreTokens(0)
{
}
return result
}
; This is a stack of token positions that are being held. This is for
; the case where the parse partially matches, but then fails later,
; allowing to go back and try a different parse.
;
; If this contains any elements, then the token cache will not be cleared, but appended to when getting the next set of tokens.
holdTokensPositionStack := []
; About to test a parse with possibility of failure.
; Push a token position to the token position stack so can backtrack on failure.
holdTokens()
{
this.holdTokensPositionStack.Insert(this.t_pos)
}
; Parse was successful, don't need to backtrack to last held position.
; Pops off the last held token position.
holdTokensRelease()
{
if ((maxIndex := (holdTokensPositionStack := this.holdTokensPositionStack).MaxIndex()) == "")
FAIL("Token release called without a matching hold.")
holdTokensPositionStack.Remove()
}
; Parse was unsuccessful, backtrack and try again by
; reverting to last held position, but do not clear that position
holdTokensRevert()
{
if ((maxIndex := (holdTokensPositionStack := this.holdTokensPositionStack).MaxIndex()) == "")
FAIL("Token revert called without a matching hold.")
this.t_pos := holdTokensPositionStack[maxIndex]
}
; Finishes reading in a block comment token and drops the content
tokenizer_dropMultiLineToken(lineReader, tokens, ByRef startPos, ByRef endPos, byref match, TOKEN_END_RE)
{
while ((readLine := lineReader.GetLine(startPos, endPos, 1)) and !RegExMatch(lineReader.buffer, TOKEN_END_RE, match, startPos))
{
}
if (readLine)
return 1
else
FAIL("Didn't find end of multiline token matching RegEx '" TOKEN_END_RE "' from " this.currentParseFrom.diagnositicInfo())
}
; Finishes reading in a block comment token and returns the content
tokenizer_readMultiLineToken(lineReader, tokens, ByRef startPos, ByRef endPos, byref match, TOKEN_END_RE)
{
while ((readLine := lineReader.GetLine(startPos, endPos, 1)) and !RegExMatch(lineReader.buffer, TOKEN_END_RE, match, startPos))
{
result .= substr(lineReader.buffer, startPos, endPos-startPos+1) "`n"
}
if (readLine)
return result match
else
FAIL("Didn't find end of multiline token matching RegEx '" TOKEN_END_RE "' from " this.currentParseFrom.diagnositicInfo())
}
; If failOnFailure and there are no more tokens to get, then FAIL() , else return 0
; If holdTokens then hold tokens even if holdTokensPositionStack is empty
cacheMoreTokens(failOnFailure = 1, holdTokens = 0)
{
tokens := []
ppDoesNotNeedMore := 1
success := 1
Loop
{
; generate tokens for current line
if (this.tokenizer(this.currentParseFrom, tokens, !ppDoesNotNeedMore))
{ ; intentionally blank
}
else if (ppDoesNotNeedMore)
if (failOnFailure)
FAIL("Ran out of tokens while reading from " this.currentParseFrom.diagnositicInfo())
else
return 0
else
FAIL("Preprocessor required more tokens but stream has terminated unexpectedly while reading from " this.currentParseFrom.diagnositicInfo())
; OPTIMIZATION: Embedded preparser here to save extra processing if more tokens were needed.
; Also removes the need to put tokens somewhere to be preprocessed before appending to/replacing current tokens
; preparser returns true when it doesn't need to read in any additional lines
; if no tokens left due to preparser removing them all, continue from the start of loop
} Until (ppDoesNotNeedMore := this.preparser(tokens)) and this.someTokensAreNotIgnored(tokens)
; remove stale tokens from this.tokens if no tokens held, append otherwise.
; Also adjust t_max.
if (holdTokens or this.holdTokensPositionStack.MaxIndex())
{
; append tokens to current list
this.tokens.Insert(this.t_max + 1, tokens*)
this.t_max += tokens.MaxIndex()
}
else
{
; replace tokens
this.tokens := tokens
this.t_max := tokens.MaxIndex()
; since replacing tokens, t_pos gets reset to begining
this.t_pos := 1
}
return 1
}
}