envlang-csharp/Lexer.cs
Suzanne Soy de12594067 WIP
2020-09-01 06:41:07 +00:00

251 lines
9.2 KiB
C#

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Immutable;
using System.Globalization;
using C = System.Globalization.UnicodeCategory;
using static Global;
public static partial class Lexer {
public sealed class EOF { }
public static class Rules {
private static Rule Rule(S oldState, UnicodeCategory cat, S throughState, S newState = null)
=> new Rule(
oldState: oldState,
description: cat.ToString(),
test: c => c.codePoints
.First()
.Match(
Some: x =>
x.UnicodeCategory(0) == cat,
None: false),
throughState: throughState,
newState: newState ?? throughState);
private static Rule Rule(S oldState, EOF eof, S throughState, S newState = null)
=> new Rule(
oldState: oldState,
description: "End of file",
test: c => c.endOfFile,
throughState: throughState,
newState: newState ?? throughState);
private static string CharDescription(char c)
=> (c == '"') ? "'\"'" : $"\"{c.ToString()}\"";
private static Rule Rule(S oldState, char c, S throughState, S newState = null)
=> new Rule(
oldState: oldState,
description: CharDescription(c),
test: x => x.codePoints
.Single()
.Match(
Some: xx => xx == c.ToString(),
None: false),
throughState: throughState,
newState: newState ?? throughState);
private static Rule Rule(S oldState, char[] cs, S throughState, S newState = null) {
var csl = cs.Select(x => x.ToString()).ToImmutableList();
return new Rule(
oldState: oldState,
description: ", ".Join(cs.Select(CharDescription)),
test: x => x.codePoints
.Single()
.Match(Some: csl.Contains,
None: false),
throughState: throughState,
newState: newState ?? throughState);
}
public static EOF EOF = new EOF();
public static ImmutableList<Rule> Default = ImmutableList(
Rule(S.Space, C.DecimalDigitNumber, S.Int),
Rule(S.Space, C.SpaceSeparator, S.Space),
Rule(S.Space, EOF, S.EndOfInput, S.End),
Rule(S.Space, '"', S.StringOpen, S.String),
Rule(S.Space, '=', S.Eq),
Rule(S.Eq, '=', S.Eq, S.Space),
Rule(S.Space, '&', S.And),
Rule(S.And, '&', S.And, S.Space),
Rule(S.Space, '+', S.Plus, S.Space),
Rule(S.Space, '*', S.Times, S.Space),
// TODO: https://unicode.org/reports/tr31/#D1 has a lot to say about
// identifiers
Rule(S.Space, C.LowercaseLetter, S.Ident),
Rule(S.Space, C.UppercaseLetter, S.Ident),
Rule(S.Space, C.TitlecaseLetter, S.Ident),
Rule(S.Space, C.ModifierLetter, S.Ident),
Rule(S.Space, C.OtherLetter, S.Ident),
Rule(S.Space, C.LetterNumber, S.Ident),
Rule(S.Ident, C.LowercaseLetter, S.Ident),
Rule(S.Ident, C.UppercaseLetter, S.Ident),
Rule(S.Ident, C.TitlecaseLetter, S.Ident),
Rule(S.Ident, C.ModifierLetter, S.Ident),
Rule(S.Ident, C.OtherLetter, S.Ident),
Rule(S.Ident, C.LetterNumber, S.Ident),
Rule(S.Int, C.DecimalDigitNumber, S.Int),
Rule(S.Int, C.SpaceSeparator, S.Space),
Rule(S.Int, new[]{'.',','}, S.Decimal, S.Int),
Rule(S.Decimal, C.SpaceSeparator, S.Space),
Rule(S.String, C.LowercaseLetter, S.String),
Rule(S.String, C.UppercaseLetter, S.String),
Rule(S.String, C.DecimalDigitNumber, S.String),
Rule(S.String, '"', S.StringClose, S.Space)
);
public static ImmutableDefaultDictionary<S, List<Rule>> Dict =
Default
.GroupBy(r => r.oldState, r => r)
.ToImmutableDefaultDictionary(
new List<Rule>(),
rs => rs.Key,
rs => rs.ToList()) ;
// This adds transitions through an implicit empty whitespace.
public static ImmutableDefaultDictionary<S, List<Rule>> WithEpsilonTransitions =
Dict.ToImmutableDefaultDictionary(
new List<Rule>(),
kv => kv.Key,
kv => kv.Value.Any(r => true) // r.test(" ")
// This is a bit of a hack, the lexer tries the rules in
// order so later rules with different results are masked
// by former rules
? kv.Value.Concat(Dict[S.Space]).ToList()
: kv.Value);
}
public partial class Lexeme {
private string CustomToString() {
return $"new Lexeme({state}, \"{lexeme}\")";
}
}
private static ValueTuple<S, string, IImmutableEnumerator<Lexeme>> Transition(S state, string lexeme, GraphemeCluster c, Rule rule) {
List<Lexeme> result = new List<Lexeme>();
if (rule.throughState != state) {
result.Add(new Lexeme(state, lexeme));
state = rule.throughState;
lexeme = "";
}
lexeme += c.str;
if (rule.newState != state) {
result.Add(new Lexeme(state, lexeme));
state = rule.newState;
lexeme = "";
}
return (state, lexeme, result.GetImmutableEnumerator());
}
public static ParserErrorException ParserError(IImmutableEnumerator<GraphemeCluster> context, IImmutableEnumerator<GraphemeCluster> rest, S state, List<Rule> possibleNext, GraphemeCluster gc) {
var strContext =
context
.ToIEnumerable()
.TakeUntil(c => c.Equals(rest))
.Select(c => c.str)
.JoinWith("");
var strRest =
rest
.TakeUntil(c => c.str.StartsWith("\n"))
.Select(c => c.str)
.JoinWith("");
var expected = ", ".Join(possibleNext.Select(p => p.description));
var actual = (gc.endOfFile ? "" : "grapheme cluster ") + gc.Description();
var cat = gc.codePoints
.First()
.Match(Some: x => x.UnicodeCategory(0).ToString(),
None: "None (empty string)");
return new ParserErrorException(
$"Unexpected {actual} (Unicode category {cat}) while the lexer was in state {state}: expected one of {expected}{Environment.NewLine}{strContext} <--HERE {strRest}"
);
}
// fake Unicode category
private const UnicodeCategory EndOfFile = (UnicodeCategory)(-1);
public static U Foo<T, U>(T x, Func<T, U> f) => f(x);
[F]
private partial class Flub {
public ValueTuple<ValueTuple<string, S, IImmutableEnumerator<GraphemeCluster>>, IImmutableEnumerator<Lexeme>> F(
ValueTuple<string, S, IImmutableEnumerator<GraphemeCluster>> t,
ValueTuple<GraphemeCluster, IImmutableEnumerator<GraphemeCluster>> cur
)
{
var (lexeme, state, context) = t;
var (c, current) = cur;
var possibleNext = Rules.WithEpsilonTransitions[state];
return
possibleNext
.First(r => r.test(c))
.IfSome(rule => {
var r = Transition(state, lexeme, c, rule);
var newState = r.Item1;
var newLexeme = r.Item2;
var tokens = r.Item3;
return ((newLexeme, newState, context), tokens);
})
.ElseThrow(() => ParserError(context, current, state, possibleNext, c));
}
}
public static IImmutableEnumerator<IImmutableEnumerator<Lexeme>> Lex2(IImmutableEnumerator<GraphemeCluster> ie) {
var lexeme = "";
var state = S.Space;
// In a REPL we could reset the context at the end of each statement.
// We could also reset the context to the containing line or function when processing files.
var context = ie;
return ie.SelectAggregate((lexeme, state, context), Flub.Eq);
}
public static IImmutableEnumerator<IImmutableEnumerator<Lexeme>>
Lex1(string source)
=> Lex2(source.TextElements().GetImmutableEnumerator());
[F]
private partial class SkipInitialEmptyWhitespace {
public IImmutableEnumerator<Lexeme> F(
IImmutableEnumerator<Lexeme> lx
)
=> lx.FirstAndRest().Match(
Some: hdtl =>
// skip the initial empty whitespace
"".Equals(hdtl.Item1.lexeme)
? hdtl.Item2
: hdtl.Item1.ImSingleton().Concat(hdtl.Item2),
None: Empty<Lexeme>());
}
// TODO: move this to a .Filter() extension method.
[F]
private partial class DiscardWhitespace {
public IImmutableEnumerator<Lexeme> F(
IImmutableEnumerator<Lexeme> lx
)
=> lx.FirstAndRest().Match<Tuple<Lexeme, IImmutableEnumerator<Lexeme>>, IImmutableEnumerator<Lexeme>>(
Some: hdtl => {
var rest = hdtl.Item2.Lazy(DiscardWhitespace.Eq);
return hdtl.Item1.state.Equals(S.Space)
? rest
: hdtl.Item1.ImSingleton().Concat(rest);
},
None: Empty<Lexeme>());
}
public static IImmutableEnumerator<Lexeme> Lex(string source)
=> new Lexeme(S.StartOfInput, "").ImSingleton()
.Concat(
Lex1(source)
.Flatten()
//.Lazy(SkipInitialEmptyWhitespace.Eq)
.Lazy(DiscardWhitespace.Eq));
}