Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lexer testing #380

Merged
merged 4 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions core/src/main/kotlin/com/strumenta/kolasu/model/Position.kt
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,96 @@ val Node.startLine: Int?

val Node.endLine: Int?
get() = this.position?.end?.line

/**
 * Given the specified text and start point, it produces a position covering the text, minus the whitespace.
 *
 * If a null text is specified then a null position is returned.
 *
 * If a text with no leading or trailing whitespace is provided then this will return a position:
 * - starting at the given start point
 * - ending at the end point calculated by "adding" the text to the start point
 *
 * If the text has leading whitespace, the start point will be advanced to skip such whitespace.
 * Similarly, if the text has trailing whitespace the end point will be receded to skip such whitespace.
 */
fun strippedPosition(
    text: String?,
    start: Point
): Position? =
    text?.let { content ->
        start.positionWithLength(content.length).stripPosition(content)
    }

/**
 * See strippedPosition.
 */
fun Position.stripPosition(text: String): Position {
    var stripped = this
    var content = text
    // Consume leading spaces, advancing the start point one column per space.
    while (content.startsWith(' ')) {
        stripped = stripped.advanceStart()
        content = content.substring(1)
    }
    // Consume trailing spaces, receding the end point one column per space.
    while (content.endsWith(' ')) {
        stripped = stripped.recedeEnd()
        content = content.dropLast(1)
    }
    // NOTE(review): only plain spaces (' ') are stripped, not tabs or newlines — confirm intended.
    // Cap the end point so it never extends past the stripped text itself.
    val cappedEnd = stripped.start + content
    return if (cappedEnd.isBefore(stripped.end)) Position(stripped.start, cappedEnd) else stripped
}

/**
 * Returns a copy of this position whose start point is moved one column to the right.
 * Note: only the column is changed, so this must not be used across line breaks.
 */
fun Position.advanceStart(): Position =
    Position(Point(start.line, start.column + 1), end)

/**
 * Returns a copy of this position whose end point is moved one column to the left.
 * Note: only the column is changed, so this must not be used across line breaks.
 */
fun Position.recedeEnd(): Position =
    Position(start, Point(end.line, end.column - 1))

/**
 * Creates a bounded map with least-recently-used eviction: once more than [maxEntries]
 * entries are present, the least recently accessed entry is dropped on insertion.
 *
 * Backed by an access-ordered [LinkedHashMap]; note that reads reorder entries, so the
 * returned map is NOT thread-safe.
 *
 * The return type is [MutableMap] (the original declared [Map], forcing callers into an
 * unsafe cast to mutate the cache).
 */
private fun <K, V> createLeastRecentlyUsedMap(maxEntries: Int = 100): MutableMap<K, V> {
    // Initial capacity sized so maxEntries fit under the 0.7 load factor without rehashing.
    return object : LinkedHashMap<K, V>(maxEntries * 10 / 7, 0.7f, true) {
        override fun removeEldestEntry(eldest: Map.Entry<K, V>): Boolean {
            return size > maxEntries
        }
    }
}

/**
 * Caches the line decomposition of code snippets so repeated lookups on the same code
 * do not re-split it. Lines keep their newline terminators.
 */
private object LinesSplitter {
    // LRU-bounded cache; the factory declares Map, hence the cast to its mutable backing type.
    val cache = createLeastRecentlyUsedMap<String, List<String>>() as MutableMap<String, List<String>>

    // Lookbehind boundary: split after each '\n' so each line retains its terminator
    // (the last line may lack one). Hoisted so the regex is compiled once.
    private val lineBoundary = "(?<=\n)".toRegex()

    fun getLines(code: String): List<String> =
        cache.getOrPut(code) { code.split(lineBoundary) }
}

/**
 * Given a piece of code, it extracts from it the substring at the given position.
 *
 * Lines appear to be one-based and columns zero-based in [Position] — consistent with the
 * `lines[currLine - 1]` / raw-column indexing below. For multi-line positions the
 * intermediate lines keep their newline terminators.
 *
 * @throws RuntimeException (with the underlying failure as its cause) when the position
 *         does not fit within this text
 */
fun String.codeAtPosition(position: Position): String {
    try {
        val lines = LinesSplitter.getLines(this)
        var currLine = position.start.line

        // Single-line position: the slice between the two columns on that line.
        if (position.start.line == position.end.line) {
            return lines[currLine - 1].substring(position.start.column, position.end.column)
        }

        // Multi-line position: tail of the first line, the middle lines whole,
        // then the head of the last line.
        val res = StringBuilder(lines[currLine - 1].substring(position.start.column))
        currLine++
        while (currLine <= lines.size && currLine < position.end.line) {
            res.append(lines[currLine - 1])
            currLine++
        }
        res.append(lines[currLine - 1].substring(0, position.end.column))
        return res.toString()
    } catch (t: Throwable) {
        // Chain the original failure as the cause instead of discarding it.
        throw RuntimeException("Unable to get position $position in text:\n```$this```", t)
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ abstract class KolasuANTLRLexer<T : KolasuToken>(val tokenFactory: TokenFactory<
break
} else {
if (!onlyFromDefaultChannel || t.channel == Token.DEFAULT_CHANNEL) {
tokens.add(tokenFactory.convertToken(t))
if (t.type != Token.EOF) {
tokens.add(tokenFactory.convertToken(t))
}
last = t
}
}
Expand Down
8 changes: 7 additions & 1 deletion core/src/main/kotlin/com/strumenta/kolasu/parsing/Parsing.kt
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,12 @@ data class TokenCategory(val type: String) {
val KEYWORD = TokenCategory("Keyword")
val NUMERIC_LITERAL = TokenCategory("Numeric literal")
val STRING_LITERAL = TokenCategory("String literal")
val OTHER_LITERAL = TokenCategory("Other literal")
val PLAIN_TEXT = TokenCategory("Plain text")
val WHITESPACE = TokenCategory("Whitespace")
val IDENTIFIER = TokenCategory("Identifier")
val PUNCTUATION = TokenCategory("Punctuation")
val OPERATOR = TokenCategory("Operator")
}
}

Expand Down Expand Up @@ -224,7 +229,8 @@ interface KolasuLexer<T : KolasuToken> : Serializable {
/**
* Performs "lexing" on the given code stream, i.e., it breaks it into tokens.
*/
fun lex(file: File): LexingResult<T> = BufferedInputStream(FileInputStream(file)).use { lex(it) }
fun lex(file: File, charset: Charset = Charsets.UTF_8, onlyFromDefaultChannel: Boolean = true): LexingResult<T> =
BufferedInputStream(FileInputStream(file)).use { lex(it, charset, onlyFromDefaultChannel) }
}

fun Lexer.injectErrorCollectorInLexer(issues: MutableList<Issue>) {
Expand Down
89 changes: 89 additions & 0 deletions core/src/main/kotlin/com/strumenta/kolasu/testing/LexerTesting.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package com.strumenta.kolasu.testing

import com.strumenta.kolasu.model.START_POINT
import com.strumenta.kolasu.model.codeAtPosition
import com.strumenta.kolasu.parsing.KolasuLexer
import com.strumenta.kolasu.parsing.KolasuToken
import java.io.File
import java.io.InputStream
import java.nio.charset.Charset
import kotlin.test.assertEquals

/**
 * Reads [file] with [charset], tokenizes it with [lexer], and verifies that the produced
 * tokens exactly cover the text (see checkTokensAreCoveringText).
 *
 * @return the tokens produced by the lexer
 * @throws IllegalArgumentException when the file is missing, not a regular file, or unreadable
 */
fun <T : KolasuToken> checkFileTokenization(
    file: File,
    lexer: KolasuLexer<T>,
    charset: Charset = Charsets.UTF_8
): List<T> {
    // Explicit messages so a failing precondition identifies the offending file.
    require(file.exists()) { "File not found: $file" }
    require(file.isFile()) { "Not a regular file: $file" }
    require(file.canRead()) { "File not readable: $file" }
    val code = file.readText(charset = charset)
    return checkTokenization(code, lexer)
}

/**
 * Tokenizes the whole content of [inputStream] (decoded with [charset]) and verifies
 * that the produced tokens exactly cover it. The stream is closed after reading.
 *
 * @return the tokens produced by the lexer
 */
fun <T : KolasuToken> checkTokenization(
    inputStream: InputStream,
    lexer: KolasuLexer<T>,
    charset: Charset = Charsets.UTF_8
): List<T> =
    checkTokenization(inputStream.bufferedReader(charset = charset).use { it.readText() }, lexer)

/**
 * Tokenizes [code] (including tokens from non-default channels) and verifies that the
 * produced tokens exactly cover it.
 *
 * @return the tokens produced by the lexer
 * @throws IllegalArgumentException when lexing reports any issue
 */
fun <T : KolasuToken> checkTokenization(code: String, lexer: KolasuLexer<T>): List<T> {
    val result = lexer.lex(code, onlyFromDefaultChannel = false)
    require(result.issues.isEmpty()) { "Lexing issues occurred: ${result.issues}" }
    checkTokensAreCoveringText(code, result.tokens)
    return result.tokens
}

/**
 * Verifies that [tokens] are ordered and cover [code] exactly: the first token starts at
 * [START_POINT], each subsequent token starts where the previous one ends, each token's
 * text matches both its declared position extent and the corresponding slice of [code],
 * and the last token ends at the very end of [code].
 */
fun <T : KolasuToken> checkTokensAreCoveringText(code: String, tokens: List<T>) {
    require(code.isEmpty() == tokens.isEmpty()) {
        "Empty code must produce no tokens, and non-empty code at least one"
    }
    if (code.isEmpty()) {
        return
    }

    // Tokens should be in order and they should cover without gaps or overlaps
    // the text from the very start to the very end of the code

    var prevToken: KolasuToken? = null
    tokens.forEach { token ->
        if (prevToken == null) {
            // This is the first token, so we should start at the very beginning.
            // Note: kotlin.test.assertEquals takes the EXPECTED value first.
            assertEquals(
                START_POINT,
                token.position.start,
                "The first token is expected to be at the start position $START_POINT while it is at " +
                    "${token.position.start}"
            )
        } else {
            assertEquals(
                prevToken!!.position.end,
                token.position.start,
                "Token $token does not immediately follow $prevToken"
            )
        }

        // The text specified in tokens should be as long as the position indicated
        assertEquals(
            token.position.end,
            token.position.start + token.text,
            "We have a token with position ${token.position} and text '${token.text}'"
        )

        // The text specified in the tokens should correspond to the corresponding code
        val expectedText = code.codeAtPosition(token.position)
        assertEquals(
            expectedText,
            token.text,
            "At position ${token.position} we found '${token.text}' while we expected '$expectedText'"
        )

        prevToken = token
    }

    val codeEnd = START_POINT + code
    assertEquals(codeEnd, prevToken!!.position.end, "The last token should end at the very end of the code")
}
Loading