Unicode lexer & compilation to JS of strings and of numbers (previously a placeholder).

This commit is contained in:
Suzanne Soy 2020-08-14 05:03:02 +00:00
parent 3f23248c14
commit 9a885195c2
10 changed files with 364 additions and 37 deletions

View File

@ -1,7 +1,12 @@
namespace Compilers {
  /// <summary>JavaScript back-end of the compiler.</summary>
  public class JS {
    /// <summary>
    /// Compiles an expression to a JavaScript statement that writes the
    /// value of the expression to standard output.
    /// </summary>
    /// <param name="source">The expression to compile.</param>
    /// <returns>JavaScript source text calling <c>process.stdout.write</c>.</returns>
    public static string Compile(Ast.Expr source) {
      return "process.stdout.write(String("
        + source.Match(
            Int: i => i.ToString(),
            // The text is spliced into a single-quoted JavaScript literal,
            // so it has to be escaped or a quote/backslash/newline in the
            // source would break (or inject code into) the generated program.
            String: s => $"'{EscapeJsString(s.ToString())}'"
          )
        + "));";
    }

    // Escapes raw so that it can be placed between single quotes in
    // JavaScript source and evaluate back to the same string.
    private static string EscapeJsString(string raw) {
      var escaped = new System.Text.StringBuilder(raw.Length);
      foreach (var ch in raw) {
        switch (ch) {
          case '\\': escaped.Append("\\\\"); break;
          case '\'': escaped.Append("\\'"); break;
          case '\n': escaped.Append("\\n"); break;
          case '\r': escaped.Append("\\r"); break;
          // Line/paragraph separators were line terminators inside string
          // literals before ES2019; escape them to stay portable.
          case '\u2028': escaped.Append("\\u2028"); break;
          case '\u2029': escaped.Append("\\u2029"); break;
          default: escaped.Append(ch); break;
        }
      }
      return escaped.ToString();
    }
  }
}

View File

@ -5,4 +5,7 @@ public static class Global {
public static Exe Exe (string str) => new Exe (str); public static Exe Exe (string str) => new Exe (str);
public static void Log (string str) => System.Console.WriteLine(str); public static void Log (string str) => System.Console.WriteLine(str);
/// <summary>
/// Function-style wrapper around <c>StringBuilder.Append(string)</c>, usable
/// as a method group (e.g. as the accumulator of an <c>Aggregate</c> call).
/// </summary>
/// <param name="b">The builder to append to.</param>
/// <param name="s">The text to append.</param>
/// <returns>Whatever <c>b.Append(s)</c> returns.</returns>
public static System.Text.StringBuilder Append(System.Text.StringBuilder b, string s) {
  return b.Append(s);
}
} }

140
Parser.cs
View File

@ -1,6 +1,140 @@
using Ast; using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using System.Globalization;
using C = System.Globalization.UnicodeCategory;
using static Global;
public static class Parser { public static class Parser {
// Lexer states. A Lexeme is tagged with the state the lexer was in while
// its text was being accumulated.
public static Expr Parse(string source) { public enum S {
return new Int(42); End, // entered by Lex1 when the end-of-file cluster is read
Space, // initial state of Lex1; also entered on a SpaceSeparator cluster
Int, // entered by Lex1 on a DecimalDigitNumber cluster
Decimal, // NOTE(review): no transition into this state is visible in Lex1 yet
String, // NOTE(review): no transition into this state is visible in Lex1 yet
}
/// <summary>
/// A lexer state together with the text that was accumulated while the
/// lexer was in that state.
/// </summary>
public struct Lexeme {
  public readonly S state;
  // TODO: maybe keep this as a list of grapheme clusters
  public readonly string lexeme;

  public Lexeme(S state, string lexeme) {
    this.lexeme = lexeme;
    this.state = state;
  }

  /// <summary>Renders the lexeme as a constructor-call-like string.</summary>
  public override string ToString()
    => "new Lexeme(" + state + ", \"" + lexeme + "\")";
}
// Transition: feeds the cluster c to the lexer and moves it to newState.
// When the state does not change, c is appended to the lexeme in progress
// and nothing is emitted. When it changes, the lexeme in progress is
// emitted (tagged with the state being left) and a new lexeme is started
// with c as its first cluster.
private static IEnumerable<Lexeme> T(ref S state, ref string lexeme, GraphemeCluster c, S newState) {
  if (newState == state) {
    lexeme += c.str;
    return Enumerable.Empty<Lexeme>();
  }

  // Capture the finished lexeme before overwriting the ref parameters.
  var finished = new Lexeme(state, lexeme);
  state = newState;
  lexeme = "" + c.str;
  return finished.Singleton();
}
/// <summary>
/// Aborts with an exception whose message shows the text read so far, a
/// <c>__HERE:__</c> marker, and the rest of the current line.
/// </summary>
/// <param name="context">The text consumed so far.</param>
/// <param name="stream">
/// The remaining grapheme clusters. They are consumed from this enumerator
/// until a cluster starting with a line feed is found or the stream ends.
/// </param>
/// <exception cref="Exception">Always thrown; this method never returns.</exception>
public static void ParseError(StringBuilder context, IEnumerator<GraphemeCluster> stream) {
  var rest =
    stream
      .SingleUseEnumerable()
      // Ordinal: we are looking for the literal line-feed character, not
      // for a culture-dependent linguistic match.
      .TakeUntil(c => c.str.StartsWith("\n", StringComparison.Ordinal))
      .Select(c => c.str)
      .Aggregate(new StringBuilder(), Append);
  throw new Exception(
    $"Cannot parse this:{Environment.NewLine}{context}__HERE:__{rest}"
  );
}
// fake Unicode category
private const UnicodeCategory EndOfFile = (UnicodeCategory)(-1);
/// <summary>
/// Runs the lexer state machine over <paramref name="source"/>, one
/// grapheme cluster at a time.
/// </summary>
/// <param name="source">The program text to lex.</param>
/// <returns>
/// One element per cluster read: the result of the transition taken for
/// that cluster (empty while a lexeme is still growing, a single finished
/// lexeme when the state changes).
/// </returns>
/// <remarks>
/// The category of a cluster is the Unicode category of its first code
/// point. Any category without a transition goes to <c>ParseError</c>.
/// NOTE(review): a line feed is category Control, not SpaceSeparator, so it
/// currently ends up in ParseError; confirm whether that is intended.
/// </remarks>
public static IEnumerable<IEnumerable<Lexeme>> Lex1(string source) {
  var context = new StringBuilder();
  var lexeme = "";
  var state = S.Space;
  // Dispose the enumerator even when the consumer stops early or
  // ParseError throws.
  using (var e = source.TextElements().GetEnumerator()) {
    while (e.MoveNext()) {
      var c = e.Current;
      // context includes the current cluster, so ParseError's marker is
      // placed right after the cluster that could not be handled.
      context.Append(c.str);
      var charCategory =
        c.endOfFile
          ? EndOfFile
          : Char.GetUnicodeCategory(c.codePoints.First(), 0);
      switch (state) {
        // Both states currently accept exactly the same input.
        case S.Space:
        case S.Int:
          switch (charCategory) {
            case C.DecimalDigitNumber:
              yield return T(ref state, ref lexeme, c, S.Int);
              break;
            case C.SpaceSeparator:
              yield return T(ref state, ref lexeme, c, S.Space);
              break;
            case EndOfFile:
              yield return T(ref state, ref lexeme, c, S.End);
              break;
            default:
              ParseError(context, e);
              break;
          }
          break;
      }
    }
  }
}
/// <summary>
/// Flattens the output of <c>Lex1</c> into a single stream of lexemes,
/// dropping the empty lexemes that come before the first non-empty one.
/// </summary>
/// <param name="source">The program text to lex.</param>
public static IEnumerable<Lexeme> Lex(string source) {
  var beforeFirstLexeme = true;
  foreach (var candidate in Lex1(source).SelectMany(batch => batch)) {
    // skip the initial empty whitespace
    if (beforeFirstLexeme && candidate.lexeme == "") {
      continue;
    }
    beforeFirstLexeme = false;
    yield return candidate;
  }
}
/// <summary>
/// Parses <paramref name="source"/> into an expression. Only the first
/// lexeme is looked at for now.
/// </summary>
/// <param name="source">The program text to parse.</param>
/// <returns>An <c>Ast.Int</c> or <c>Ast.String</c> built from the first lexeme.</returns>
/// <exception cref="NotImplementedException">The first lexeme is neither an integer nor a string.</exception>
/// <exception cref="FormatException">An integer lexeme contains something that is not a decimal digit.</exception>
/// <exception cref="OverflowException">An integer lexeme does not fit in an Int32.</exception>
/// <exception cref="Exception">The input produced no lexeme at all.</exception>
public static Ast.Expr Parse(string source) {
  foreach (var lexeme in Lex(source)) {
    switch (lexeme.state) {
      case S.Int:
        return new Ast.Int(DecimalDigitsToInt32(lexeme.lexeme));
      case S.String:
        return new Ast.String(lexeme.lexeme);
      default:
        throw new NotImplementedException();
    }
  }
  throw new Exception("empty file, rm this when consuming the whole stream of lexemes.");
}

// Converts a run of Unicode decimal digits to an Int32. The lexer accepts
// every code point of category DecimalDigitNumber, whereas Int32.Parse only
// understands ASCII digits, so the value is computed from the Unicode
// decimal digit value of each code point instead.
private static int DecimalDigitsToInt32(string digits) {
  if (digits.Length == 0) {
    throw new FormatException("An integer literal needs at least one digit.");
  }
  var value = 0;
  // Step by code point: digits outside the BMP take two UTF-16 units.
  for (var i = 0; i < digits.Length; i += Char.IsSurrogatePair(digits, i) ? 2 : 1) {
    var digit = CharUnicodeInfo.GetDecimalDigitValue(digits, i);
    if (digit < 0) {
      throw new FormatException($"'{digits}' is not a sequence of decimal digits.");
    }
    // Same failure mode as Int32.Parse when the literal is too large.
    value = checked(value * 10 + digit);
  }
  return value;
}
} }

1
Tests/002-43.e Normal file
View File

@ -0,0 +1 @@
43

1
Tests/002-43.o Normal file
View File

@ -0,0 +1 @@
43

9
Utils/Char.cs Normal file
View File

@ -0,0 +1,9 @@
using System;
/// <summary>
/// Extension-method spellings of the static surrogate tests on
/// <see cref="System.Char"/>.
/// </summary>
public static class CharExtensionMethods {
  /// <summary>Returns <c>Char.IsHighSurrogate(c)</c>.</summary>
  public static bool IsHighSurrogate(this Char c) {
    return Char.IsHighSurrogate(c);
  }

  /// <summary>Returns <c>Char.IsLowSurrogate(c)</c>.</summary>
  public static bool IsLowSurrogate(this Char c) {
    return Char.IsLowSurrogate(c);
  }
}

View File

@ -1,31 +0,0 @@
using System;
using System.Linq;
using System.Collections.Generic;
/// <summary>Small helpers for sequences, lists and tuples.</summary>
public static class Collection {
  /// <summary>
  /// Copies <paramref name="x"/> into a list, then calls
  /// <paramref name="f"/> on each element of that copy.
  /// </summary>
  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f) {
    var snapshot = x.ToList();
    snapshot.ForEach(f);
  }

  /*
  public static ListI<Tuple<T,U>> Add<T,U>(this ListI<Tuple<T,U>> l, T x, U y)
  => l.Add(Tuple.Create(x,y));
  */

  // System.Collections.Immutable requires NuGet and is not available on repl.it
  /// <summary>Adds <paramref name="x"/> to <paramref name="l"/> in place and returns the same list.</summary>
  public static List<T> Cons<T>(this List<T> l, T x) {
    l.Add(x);
    return l;
  }

  // Circumvent bug with collection initializers, tuples and
  // first-class functions by using repeated .Add()
  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs
  /// <summary>Adds the pair (x, y) to the list and returns the list.</summary>
  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y) {
    return l.Cons(Tuple.Create(x, y));
  }

  /// <summary>Adds the triple (x, y, z) to the list and returns the list.</summary>
  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z) {
    return l.Cons(Tuple.Create(x, y, z));
  }

  /// <summary>Copies the two items of a pair into the out parameters.</summary>
  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
    b = t.Item2;
    a = t.Item1;
  }
}

114
Utils/Enumerable.cs Normal file
View File

@ -0,0 +1,114 @@
using System;
using System.Linq;
using System.Collections.Generic;
/// <summary>Small helpers for sequences, enumerators, lists and tuples.</summary>
public static class Collection {
  /// <summary>
  /// Copies <paramref name="x"/> into a list, then calls
  /// <paramref name="f"/> on each element of that copy.
  /// </summary>
  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f)
    => x.ToList().ForEach(f);

  /*
  public static ListI<Tuple<T,U>> Add<T,U>(this ListI<Tuple<T,U>> l, T x, U y)
  => l.Add(Tuple.Create(x,y));
  */

  // System.Collections.Immutable requires NuGet and is not available on repl.it
  /// <summary>Adds <paramref name="x"/> to <paramref name="l"/> in place and returns the same list.</summary>
  public static List<T> Cons<T>(this List<T> l, T x) { l.Add(x); return l; }

  // Circumvent bug with collection initializers, tuples and
  // first-class functions by using repeated .Add()
  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs
  /// <summary>Adds the pair (x, y) to the list and returns the list.</summary>
  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y)
    => l.Cons(Tuple.Create(x,y));

  /// <summary>Adds the triple (x, y, z) to the list and returns the list.</summary>
  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z)
    => l.Cons(Tuple.Create(x,y,z));

  /// <summary>Copies the two items of a pair into the out parameters.</summary>
  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
    a = t.Item1;
    b = t.Item2;
  }

  /// <summary>
  /// An element of a sequence together with its zero-based position and
  /// whether it is the first and/or the last element.
  /// </summary>
  public struct Item<T> {
    public readonly T item;
    public readonly long index;
    public readonly bool first;
    public readonly bool last;
    public Item(T item, long index, bool first, bool last) {
      this.item = item;
      this.index = index;
      this.first = first;
      this.last = last;
    }
  }

  /// <summary>
  /// Pairs every element of <paramref name="e"/> with its index and its
  /// first/last flags. An empty sequence yields nothing.
  /// </summary>
  public static IEnumerable<Item<T>> Indexed<T>(this IEnumerable<T> e) {
    long i = 0L;
    bool first = true;
    // Each element is emitted one step late: only once the next element
    // (or the end of the sequence) is seen do we know the "last" flag.
    T prevX = default(T); // Dummy
    long prevI = default(long);
    bool prevFirst = default(bool);
    foreach (var x in e) {
      if (!first) {
        yield return new Item<T>(prevX, prevI, prevFirst, false);
      }
      prevX = x;
      prevI = i;
      prevFirst = first;
      first = false;
      i++;
    }
    if (!first) {
      yield return new Item<T>(prevX, prevI, prevFirst, true);
    }
  }

  /// <summary>
  /// An enumerator wrapper that can tell, via <c>Peek</c>, whether another
  /// element follows the current one without changing <c>Current</c>.
  /// </summary>
  /// <remarks>
  /// This is a mutable struct: copies do not share the peek state, so keep
  /// it in a single variable and do not pass it around by value.
  /// </remarks>
  public struct Peekable<T> : IEnumerator<T>, System.Collections.IEnumerator {
    private IEnumerator<T> e;
    private bool peeked;
    // Result of the MoveNext() performed by Peek(), replayed by MoveNext().
    private bool peekedHasNext;
    private T previous;

    /// <summary>The current element; unaffected by <c>Peek</c>.</summary>
    public T Current { get => peeked ? previous : e.Current; }
    object System.Collections.IEnumerator.Current {
      get => this.Current;
    }

    /// <summary>Advances to the next element.</summary>
    /// <returns>False when the end of the sequence has been reached.</returns>
    public bool MoveNext() {
      if (this.peeked) {
        // Peek() already advanced the underlying enumerator; advancing it
        // again here would silently skip the peeked element.
        this.peeked = false;
        this.previous = default(T);
        return this.peekedHasNext;
      }
      return this.e.MoveNext();
    }

    /// <summary>
    /// Tells whether an element follows the current one. Call it only
    /// after a successful <c>MoveNext</c>, and at most once per element.
    /// </summary>
    /// <exception cref="Exception">Peek was already called for the current element.</exception>
    public bool Peek() {
      if (this.peeked) {
        throw new Exception("Already peeked once");
      }
      this.previous = e.Current;
      this.peeked = true;
      this.peekedHasNext = this.e.MoveNext();
      return this.peekedHasNext;
    }

    public void Dispose() { e.Dispose(); }

    /// <summary>Resets the underlying enumerator and forgets any pending peek.</summary>
    public void Reset() {
      e.Reset();
      this.peeked = false;
      this.peekedHasNext = false;
      this.previous = default(T);
    }

    public Peekable(IEnumerable<T> e) {
      this.e = e.GetEnumerator();
      this.peeked = false;
      this.peekedHasNext = false;
      this.previous = default(T);
    }
  }

  /// <summary>Wraps a fresh enumerator of <paramref name="e"/> in a <c>Peekable</c>.</summary>
  public static Peekable<T> Peekable<T>(this IEnumerable<T> e) {
    return new Peekable<T>(e);
  }

  /// <summary>
  /// Exposes the remaining elements of an enumerator as a sequence. The
  /// enumerator is consumed as the sequence is iterated, so the result can
  /// only be meaningfully iterated once.
  /// </summary>
  public static IEnumerable<T> SingleUseEnumerable<T>(this IEnumerator<T> e) {
    while (e.MoveNext()) {
      yield return e.Current;
    }
  }

  /// <summary>Yields elements until <paramref name="f"/> holds; the matching element is not yielded.</summary>
  public static IEnumerable<T> TakeUntil<T>(this IEnumerable<T> e, Func<T, bool> f)
    => e.TakeWhile(x => !f(x));

  /// <summary>Returns a sequence containing only <paramref name="x"/>.</summary>
  public static IEnumerable<T> Singleton<T>(this T x) {
    yield return x;
  }
}

89
Utils/Unicode.cs Normal file
View File

@ -0,0 +1,89 @@
using System;
using System.Linq;
using System.Collections.Generic;
using System.Globalization;
/// <summary>
/// One element of the stream produced by <c>TextElements</c>: either a
/// piece of text, or the end-of-file marker that closes the stream.
/// </summary>
public struct GraphemeCluster {
  /// <summary>True only for the marker emitted after the last piece of text.</summary>
  public readonly bool endOfFile;
  /// <summary>The text of the cluster (empty for the end-of-file marker).</summary>
  public readonly string str;
  /// <summary>The text of the cluster split into one string per code point.</summary>
  public readonly IEnumerable<string> codePoints;

  public GraphemeCluster(bool endOfFile, string str, IEnumerable<string> codePoints) {
    this.codePoints = codePoints;
    this.str = str;
    this.endOfFile = endOfFile;
  }
}
/// <summary>Helpers to walk a UTF-16 string by code point and by text element.</summary>
public static class UnicodeExtensionMethods {
  /// <summary>
  /// Splits a sequence of UTF-16 code units into one string per code
  /// point: a surrogate pair stays together, anything else stands alone.
  /// </summary>
  /// <exception cref="ArgumentException">
  /// A high surrogate is not followed by a low surrogate (raised lazily,
  /// when the offending position is reached during enumeration).
  /// </exception>
  public static IEnumerable<string> SplitOnSurrogatePairs(this IEnumerable<char> s) {
    using (var e = s.GetEnumerator()) {
      while (e.MoveNext()) {
        var firstOfPossiblePair = e.Current;
        if (!Char.IsHighSurrogate(firstOfPossiblePair)) {
          yield return firstOfPossiblePair.ToString();
          continue;
        }
        // Check MoveNext(): reading Current past the end is not valid.
        if (!e.MoveNext()) {
          throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate at the end of the input.");
        }
        if (!Char.IsLowSurrogate(e.Current)) {
          throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate that is not followed by a low surrogate.");
        }
        yield return firstOfPossiblePair.ToString() + e.Current.ToString();
      }
    }
  }

  /// <summary>Returns the code point starting at index <paramref name="pos"/> of <paramref name="s"/>.</summary>
  public static int ToUtf32(this string s, int pos)
    => Char.ConvertToUtf32(s, pos);

  /// <summary>
  /// Splits <paramref name="s"/> into text elements, additionally gluing
  /// a tag sequence (waving black flag or language tag followed by tag
  /// characters) into a single element, and ends the stream with an
  /// end-of-file marker.
  /// </summary>
  /// <returns>
  /// One <c>GraphemeCluster</c> per element, followed by exactly one
  /// cluster with <c>endOfFile</c> set and an empty string.
  /// </returns>
  public static IEnumerable<GraphemeCluster> TextElements(this string s) {
    // in:  "1\u22152e\u0301\u0327a"
    // out: ["1"], ["\u2215"], ["2"], ["e", "\u0301", "\u0327"], ["a"]
    // i.e. a base character stays together with its combining marks.
    const int wavingBlackFlag = 0x1F3F4;
    // TODO: check the role of "begin" for tag sequences.
    const int begin = 0xE0001;
    // All the characters between sp and cancelTag are valid tag characters
    const int sp = 0xE0020;
    const int cancelTag = 0xE007F;

    var e = StringInfo.GetTextElementEnumerator(s);
    // True when the tag loop below has already advanced e to an element
    // that does not belong to the tag sequence and is still to be emitted.
    var alreadyMoved = false;
    while (alreadyMoved || e.MoveNext()) {
      alreadyMoved = false;
      // TODO: check whether UTF-16 allows for different
      // encodings for the same code point and if so how
      // to compare them correctly.
      var te = e.Current.ToString();
      var first = te.ToUtf32(0);
      // TODO: te.length is hardcoded as 2 because the tag
      // code points all require a surrogate pair (i.e. don't
      // fit in a single UTF-16 element).
      if ((te.Length == 2 && first == wavingBlackFlag) || first == begin) {
        while (e.MoveNext()) {
          var te2 = e.Current.ToString();
          var first2 = te2.ToUtf32(0);
          if (te2.Length == 2 && first2 >= sp && first2 <= cancelTag) {
            te += te2;
            if (first2 == cancelTag) {
              break;
            }
          } else {
            // Not a tag character: the sequence is over. Stop here so the
            // outer loop emits this element instead of it being dropped
            // (along with everything after it).
            alreadyMoved = true;
            break;
          }
        }
      }
      yield return new GraphemeCluster(
        false,
        te,
        te.SplitOnSurrogatePairs()
      );
    }
    yield return new GraphemeCluster(
      true,
      "",
      Enumerable.Empty<string>()
    );
  }
}

View File

@ -28,9 +28,11 @@ public static class MainClass {
CompileToFile(compile, sourcePath, destPath); CompileToFile(compile, sourcePath, destPath);
if (runner.Run(destPath) != expected.Read()) { var actualStr = runner.Run(destPath);
var expectedStr = expected.Read();
if (actualStr != expectedStr) {
Console.WriteLine("\x1b[1;31mFail\x1b[m"); Console.WriteLine("\x1b[1;31mFail\x1b[m");
throw new Exception("Test failed " + source); throw new Exception($"Test failed {source}: expected {expectedStr} but got {actualStr}.");
} else { } else {
Console.WriteLine("\x1b[1;32mOK\x1b[m"); Console.WriteLine("\x1b[1;32mOK\x1b[m");
} }