envlang-csharp/Lexer.cs

176 lines
6.0 KiB
C#

using System;
using System.Text;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Immutable;
using System.Globalization;
using C = System.Globalization.UnicodeCategory;
using static Global;
public static partial class Lexer {
public sealed class EOF { }
public static class Rules {
private static Rule Rule(S oldState, UnicodeCategory cat, S throughState, S newState = null)
=> new Rule(
oldState,
cat.ToString(),
c => c.codePoints
.First()
.Match(some: (x => x.UnicodeCategory(0) == cat),
none: false),
throughState,
newState ?? throughState);
private static Rule Rule(S oldState, EOF eof, S throughState, S newState = null)
=> new Rule(
oldState,
"End of file",
c => c.endOfFile,
throughState,
newState ?? throughState);
private static string CharDescription(char c)
=> (c == '"') ? "'\"'" : $"\"{c.ToString()}\"";
private static Rule Rule(S oldState, char c, S throughState, S newState = null)
=> new Rule(
oldState,
CharDescription(c),
x => x.codePoints
.Single()
.Match(some: xx => xx == c.ToString(),
none: false),
throughState,
newState ?? throughState);
private static Rule Rule(S oldState, char[] cs, S throughState, S newState = null) {
var csl = cs.Select(x => x.ToString()).ToImmutableList();
return new Rule(
oldState,
", ".Join(cs.Select(CharDescription)),
x => x.codePoints.Single().Match(some: csl.Contains, none: false),
throughState,
newState ?? throughState);
}
public static EOF EOF = new EOF();
public static ImmutableList<Rule> Default = ImmutableList(
Rule(S.Space, C.DecimalDigitNumber, S.Int),
Rule(S.Space, C.SpaceSeparator, S.Space),
Rule(S.Space, EOF, S.End),
Rule(S.Space, '"', S.StringOpen, S.String),
Rule(S.Int, C.DecimalDigitNumber, S.Int),
Rule(S.Int, C.SpaceSeparator, S.Space),
Rule(S.Int, new[]{'.',','}, S.Decimal, S.Int),
Rule(S.Decimal, C.SpaceSeparator, S.Space),
Rule(S.String, C.LowercaseLetter, S.String),
Rule(S.String, C.UppercaseLetter, S.String),
Rule(S.String, C.DecimalDigitNumber, S.String),
Rule(S.String, '"', S.StringClose, S.Space)
);
public static ImmutableDefaultDictionary<S, List<Rule>> Dict =
Default
.GroupBy(r => r.oldState, r => r)
.ToImmutableDefaultDictionary(
new List<Rule>(),
rs => rs.Key,
rs => rs.ToList()) ;
// This adds transitions through an implicit empty whitespace.
public static ImmutableDefaultDictionary<S, List<Rule>> WithEpsilonTransitions =
Dict.ToImmutableDefaultDictionary(
new List<Rule>(),
kv => kv.Key,
kv => kv.Value.Any(r => true) // r.test(" ")
// This is a bit of a hack, the lexer tries the rules in
// order so later rules with different results are masked
// by former rules
? kv.Value.Concat(Dict[S.Space]).ToList()
: kv.Value);
}
public struct Lexeme {
public readonly S state;
// TODO: maybe keep this as a list of grapheme clusters
public readonly string lexeme;
public Lexeme(S state, string lexeme) {
this.state = state;
this.lexeme = lexeme;
}
public override string ToString() {
return $"new Lexeme({state}, \"{lexeme}\")";
}
}
private static IEnumerable<Lexeme> Transition(ref S state, ref string lexeme, GraphemeCluster c, Rule rule) {
List<Lexeme> result = new List<Lexeme>();
if (rule.throughState != state) {
result.Add(new Lexeme(state, lexeme));
state = rule.throughState;
lexeme = "";
}
lexeme += c.str;
if (rule.newState != state) {
result.Add(new Lexeme(state, lexeme));
state = rule.newState;
lexeme = "";
}
return result;
}
public static ParserErrorException ParserError(StringBuilder context, IEnumerator<GraphemeCluster> stream, S state, List<Rule> possibleNext, GraphemeCluster gc) {
var rest =
stream
.SingleUseEnumerable()
.TakeUntil(c => c.str.StartsWith("\n"))
.Select(c => c.str)
.JoinWith("");
var expected = ", ".Join(possibleNext.Select(p => p.description));
var actual = (gc.endOfFile ? "" : "grapheme cluster ") + gc.Description();
var cat = gc.codePoints
.First()
.Match(some: (x => x.UnicodeCategory(0).ToString()),
none: "None (empty string)");
return new ParserErrorException(
$"Unexpected {actual} (Unicode category {cat}) while the lexer was in state {state}: expected one of {expected}{Environment.NewLine}{context} <--HERE {rest}"
);
}
// fake Unicode category
private const UnicodeCategory EndOfFile = (UnicodeCategory)(-1);
public static IEnumerable<IEnumerable<Lexeme>> Lex1(string source) {
var context = new StringBuilder();
var lexeme = "";
var state = S.Space;
var e = source.TextElements().GetEnumerator();
while (e.MoveNext()) {
var c = e.Current;
context.Append(c.str);
var possibleNext = Rules.WithEpsilonTransitions[state];
yield return
possibleNext
.First(r => r.test(c))
.IfSome(rule => Transition(ref state, ref lexeme, c, rule))
.ElseThrow(() => ParserError(context, e, state, possibleNext, c));
}
}
public static IEnumerable<Lexeme> Lex(string source) {
var first = true;
foreach (var x in Lex1(source).SelectMany(x => x)) {
if (first && "".Equals(x.lexeme)) {
// skip the initial empty whitespace
} else {
first = false;
yield return x;
}
}
}
}