Skip to content

Commit

Permalink
feat: support basic regex analysers in XML (#828)
Browse files Browse the repository at this point in the history
The `<analyse>` element contains a regex to match against the input, and
a score if the pattern matches.

The scores of all matching patterns for a lexer are summed.

Replaces #815, #813 and #826.
  • Loading branch information
alecthomas committed Aug 21, 2023
1 parent 2226663 commit a20cd7e
Show file tree
Hide file tree
Showing 18 changed files with 46 additions and 175 deletions.
12 changes: 12 additions & 0 deletions lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@ type Config struct {
//
// If this is 0 it will be treated as a default of 1.
Priority float32 `xml:"priority,omitempty"`

// Analyse is a list of regexes to match against the input.
//
// The sum of the scores of all matching patterns will be
// used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
}

// AnalyseConfig defines a single regex analyser pattern.
//
// It is decoded from an <analyse> XML element; both fields are read from
// attributes of that element.
type AnalyseConfig struct {
// Regex is the pattern to match against the input text (the "regex" attribute).
Regex string `xml:"regex,attr"`
// Score is the value added to the lexer's analysis score when Regex
// matches the input (the "score" attribute).
Score float32 `xml:"score,attr"`
}

// Token output to formatter.
Expand Down
37 changes: 0 additions & 37 deletions lexers/c.go

This file was deleted.

44 changes: 0 additions & 44 deletions lexers/c_test.go

This file was deleted.

36 changes: 0 additions & 36 deletions lexers/cpp.go

This file was deleted.

57 changes: 0 additions & 57 deletions lexers/cpp_test.go

This file was deleted.

3 changes: 3 additions & 0 deletions lexers/embedded/c++.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@
<filename>*.H</filename>
<filename>*.cp</filename>
<filename>*.CPP</filename>
<filename>*.tpp</filename>
<mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" />
<analyse regex="using namespace " score="0.4" />
</config>
<rules>
<state name="classname">
Expand Down
2 changes: 2 additions & 0 deletions lexers/embedded/c.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
<mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="(?m)^\s*#include &lt;" score="0.1"/>
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
</config>
<rules>
<state name="statement">
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions lexers/testdata/analysis/c.ifdef.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1
File renamed without changes.
1 change: 1 addition & 0 deletions lexers/testdata/analysis/c.ifndef.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1
File renamed without changes.
1 change: 1 addition & 0 deletions lexers/testdata/analysis/c.include.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1
File renamed without changes.
1 change: 1 addition & 0 deletions lexers/testdata/analysis/cpp.include.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.2
File renamed without changes.
1 change: 1 addition & 0 deletions lexers/testdata/analysis/cpp.namespace.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.4
25 changes: 24 additions & 1 deletion serialise.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"reflect"
"regexp"
"strings"

"github.com/dlclark/regexp2"
)

// Serialisation of Chroma rules to XML. The format is:
Expand Down Expand Up @@ -107,7 +109,7 @@ func fastUnmarshalConfig(from fs.FS, path string) (*Config, error) {
var config Config
err = dec.DecodeElement(&config, &se)
if err != nil {
panic(err)
return nil, fmt.Errorf("%s: %w", path, err)
}
return &config, nil
}
Expand Down Expand Up @@ -135,8 +137,29 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse {
re, err := regexp2.Compile(ra.Regex, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
return &RegexLexer{
config: config,
analyser: func(text string) float32 {
var score float32
for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok {
score += ra.score
}
}
return score
},
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config
Expand Down

0 comments on commit a20cd7e

Please sign in to comment.