From 9a885195c2dc0e7d918d27bd4f1fd232457674d2 Mon Sep 17 00:00:00 2001
From: Suzanne Soy
Date: Fri, 14 Aug 2020 05:03:02 +0000
Subject: [PATCH] Unicode lexer & compilation to JS of strings and of numbers (previously a placeholder).

---
Reviewer notes (ignored by git; not part of the commit message):

* This patch had been flattened onto a few very long lines and had lost its
  generic type arguments. It is laid out again as one diff line per line; the
  line splits reproduce every hunk count and the diffstat below. Indentation
  and the position of blank context lines in the Global.cs and main.cs hunks
  are a best-effort reconstruction (use `git apply --ignore-whitespace` if
  the context does not match).
* Utils/Unicode.cs, TextElements: leave the tag-sequence loop as soon as a
  non-tag element is met. Without the `break` every following element was
  consumed and dropped, and the outer loop then read the enumerator past its
  end.
* Utils/Unicode.cs, SplitOnSurrogatePairs: check the result of MoveNext()
  so that a high surrogate at the very end of the input raises the intended
  ArgumentException instead of reading an exhausted enumerator.
* Compilers/JS.cs: escape backslash, single quote, CR and LF before placing
  a string inside the generated single-quoted JavaScript literal.
* The blob ids on the `index` lines still describe the original commit.

 Compilers/JS.cs     |   7 ++-
 Global.cs           |   3 +
 Parser.cs           | 140 +++++++++++++++++++++++++++++++++++++++++++-
 Tests/002-43.e      |   1 +
 Tests/002-43.o      |   1 +
 Utils/Char.cs       |   9 +++
 Utils/Collection.cs |  31 ----------
 Utils/Enumerable.cs | 114 ++++++++++++++++++++++++++++++++++++
 Utils/Unicode.cs    |  89 ++++++++++++++++++++++++++++
 main.cs             |   6 +-
 10 files changed, 364 insertions(+), 37 deletions(-)
 create mode 100644 Tests/002-43.e
 create mode 100644 Tests/002-43.o
 create mode 100644 Utils/Char.cs
 delete mode 100644 Utils/Collection.cs
 create mode 100644 Utils/Enumerable.cs
 create mode 100644 Utils/Unicode.cs

diff --git a/Compilers/JS.cs b/Compilers/JS.cs
index 5c53996..40eb90b 100644
--- a/Compilers/JS.cs
+++ b/Compilers/JS.cs
@@ -1,7 +1,12 @@
 namespace Compilers {
   public class JS {
     public static string Compile(Ast.Expr source) {
-      return "process.stdout.write('42');";
+      return "process.stdout.write(String("
+        + source.Match(
+            Int: i => i.ToString(),
+            String: s => "'" + s.ToString().Replace("\\", "\\\\").Replace("'", "\\'").Replace("\n", "\\n").Replace("\r", "\\r") + "'"
+          )
+        + "));";
     }
   }
 }
\ No newline at end of file
diff --git a/Global.cs b/Global.cs
index b6dc4d5..a9cea0f 100644
--- a/Global.cs
+++ b/Global.cs
@@ -5,4 +5,7 @@ public static class Global {
   public static Exe Exe (string str) => new Exe (str);
 
   public static void Log (string str) => System.Console.WriteLine(str);
+
+  public static System.Text.StringBuilder Append(System.Text.StringBuilder b, string s)
+    => b.Append(s);
 }
\ No newline at end of file
diff --git a/Parser.cs b/Parser.cs
index f03ccc9..6811f7e 100644
--- a/Parser.cs
+++ b/Parser.cs
@@ -1,6 +1,140 @@
-using Ast;
+using System;
+using System.Text;
+using System.Collections.Generic;
+using System.Linq;
+using System.Globalization;
+using C = System.Globalization.UnicodeCategory;
+using static Global;
+
 public static class Parser {
-  public static Expr Parse(string source) {
-    return new Int(42);
+  public enum S {
+    End,
+    Space,
+    Int,
+    Decimal,
+    String,
+  }
+
+  public struct Lexeme {
+    public readonly S state;
+    // TODO: maybe keep this as a list of grapheme clusters
+    public readonly string lexeme;
+    public Lexeme(S state, string lexeme) {
+      this.state = state;
+      this.lexeme = lexeme;
+    }
+    public override string ToString() {
+      return $"new Lexeme({state}, \"{lexeme}\")";
+    }
+  }
+
+  // Transition: on a state change, emit the finished lexeme and start a new one with c; otherwise just append c
+  private static IEnumerable<Lexeme> T(ref S state, ref string lexeme, GraphemeCluster c, S newState) {
+    if (newState != state) {
+      var toReturn = new Lexeme(state, lexeme);
+      state = newState;
+      lexeme = "";
+      lexeme += c.str;
+      return toReturn.Singleton();
+    } else {
+      lexeme += c.str;
+      return Enumerable.Empty<Lexeme>();
+    }
+  }
+
+  public static void ParseError(StringBuilder context, IEnumerator<GraphemeCluster> stream) {
+    var rest =
+      stream
+        .SingleUseEnumerable()
+        .TakeUntil(c => c.str.StartsWith("\n"))
+        .Select(c => c.str)
+        .Aggregate(new StringBuilder(), Append);
+
+    throw new Exception(
+      $"Cannot parse this:{Environment.NewLine}{context}__HERE:__{rest}"
+    );
+  }
+
+  // Fake Unicode category, used as the end-of-input sentinel
+  private const UnicodeCategory EndOfFile = (UnicodeCategory)(-1);
+
+  public static IEnumerable<IEnumerable<Lexeme>> Lex1(string source) {
+    var context = new StringBuilder();
+    var lexeme = "";
+    var state = S.Space;
+    var e = source.TextElements().GetEnumerator();
+    while (e.MoveNext()) {
+      var c = e.Current;
+      context.Append(c.str);
+      var charCategory =
+        c.endOfFile
+        ? EndOfFile
+        : Char.GetUnicodeCategory(c.codePoints.First(), 0);
+      switch (state) {
+        case S.Space:
+          {
+            switch (charCategory) {
+              case C.DecimalDigitNumber:
+                yield return T(ref state, ref lexeme, c, S.Int);
+                break;
+              case C.SpaceSeparator:
+                yield return T(ref state, ref lexeme, c, S.Space);
+                break;
+              case EndOfFile:
+                yield return T(ref state, ref lexeme, c, S.End);
+                break;
+              default:
+                ParseError(context, e);
+                break;
+            }
+          }
+          break;
+
+        case S.Int:
+          {
+            switch (charCategory) {
+              case C.DecimalDigitNumber:
+                yield return T(ref state, ref lexeme, c, S.Int);
+                break;
+              case C.SpaceSeparator:
+                yield return T(ref state, ref lexeme, c, S.Space);
+                break;
+              case EndOfFile:
+                yield return T(ref state, ref lexeme, c, S.End);
+                break;
+              default:
+                ParseError(context, e);
+                break;
+            }
+          }
+          break;
+      }
+    }
+  }
+
+  public static IEnumerable<Lexeme> Lex(string source) {
+    var first = true;
+    foreach (var x in Lex1(source).SelectMany(x => x)) {
+      if (first && "".Equals(x.lexeme)) {
+        // skip the initial empty whitespace
+      } else {
+        first = false;
+        yield return x;
+      }
+    }
+  }
+
+  public static Ast.Expr Parse(string source) {
+    foreach (var lexeme in Lex(source)) {
+      switch (lexeme.state) {
+        case S.Int:
+          return new Ast.Int(Int32.Parse(lexeme.lexeme));
+        case S.String:
+          return new Ast.String(lexeme.lexeme);
+        default:
+          throw new NotImplementedException();
+      }
+    }
+    throw new Exception("empty file, rm this when consuming the whole stream of lexemes.");
   }
 }
\ No newline at end of file
diff --git a/Tests/002-43.e b/Tests/002-43.e
new file mode 100644
index 0000000..ac4213d
--- /dev/null
+++ b/Tests/002-43.e
@@ -0,0 +1 @@
+43
\ No newline at end of file
diff --git a/Tests/002-43.o b/Tests/002-43.o
new file mode 100644
index 0000000..ac4213d
--- /dev/null
+++ b/Tests/002-43.o
@@ -0,0 +1 @@
+43
\ No newline at end of file
diff --git a/Utils/Char.cs b/Utils/Char.cs
new file mode 100644
index 0000000..13b03ea
--- /dev/null
+++ b/Utils/Char.cs
@@ -0,0 +1,9 @@
+using System;
+
+public static class CharExtensionMethods {
+  public static bool IsHighSurrogate(this Char c)
+    => Char.IsHighSurrogate(c);
+
+  public static bool IsLowSurrogate(this Char c)
+    => Char.IsLowSurrogate(c);
+}
\ No newline at end of file
diff --git a/Utils/Collection.cs b/Utils/Collection.cs
deleted file mode 100644
index 4d7e7b1..0000000
--- a/Utils/Collection.cs
+++ /dev/null
@@ -1,31 +0,0 @@
-using System;
-using System.Linq;
-using System.Collections.Generic;
-
-public static class Collection {
-  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f)
-    => x.ToList().ForEach(f);
-
-  /*
-  public static ListI<Tuple<T,U>> Add<T,U>(this ListI<Tuple<T,U>> l, T x, U y)
-    => l.Add(Tuple.Create(x,y));
-  */
-
-  // System.Collections.Immutable requires NuGet and is not available on repl.it
-  public static List<T> Cons<T>(this List<T> l, T x) { l.Add(x); return l; }
-
-  // Circumvent bug with collection initializers, tuples and
-  // first-class functions by using repeated .Add()
-  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs
-
-  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y)
-    => l.Cons(Tuple.Create(x,y));
-
-  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z)
-    => l.Cons(Tuple.Create(x,y,z));
-
-  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
-    a = t.Item1;
-    b = t.Item2;
-  }
-}
\ No newline at end of file
diff --git a/Utils/Enumerable.cs b/Utils/Enumerable.cs
new file mode 100644
index 0000000..8ea27d8
--- /dev/null
+++ b/Utils/Enumerable.cs
@@ -0,0 +1,114 @@
+using System;
+using System.Linq;
+using System.Collections.Generic;
+
+public static class Collection {
+  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f)
+    => x.ToList().ForEach(f);
+
+  /*
+  public static ListI<Tuple<T,U>> Add<T,U>(this ListI<Tuple<T,U>> l, T x, U y)
+    => l.Add(Tuple.Create(x,y));
+  */
+
+  // System.Collections.Immutable requires NuGet and is not available on repl.it
+  public static List<T> Cons<T>(this List<T> l, T x) { l.Add(x); return l; }
+
+  // Circumvent bug with collection initializers, tuples and
+  // first-class functions by using repeated .Add()
+  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs
+
+  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y)
+    => l.Cons(Tuple.Create(x,y));
+
+  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z)
+    => l.Cons(Tuple.Create(x,y,z));
+
+  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
+    a = t.Item1;
+    b = t.Item2;
+  }
+
+  public struct Item<T> {
+    public readonly T item;
+    public readonly long index;
+    public readonly bool first;
+    public readonly bool last;
+    public Item(T item, long index, bool first, bool last) {
+      this.item = item;
+      this.index = index;
+      this.first = first;
+      this.last = last;
+    }
+  }
+
+  public static IEnumerable<Item<T>> Indexed<T>(this IEnumerable<T> e) {
+    long i = 0L;
+    bool first = true;
+    T prevX = default(T); // Dummy
+    long prevI = default(long);
+    bool prevFirst = default(bool);
+    foreach (var x in e) {
+      if (!first) {
+        yield return new Item<T>(prevX, prevI, prevFirst, false);
+      }
+      prevX = x;
+      prevI = i;
+      prevFirst = first;
+      first = false;
+      i++;
+    }
+    if (!first) {
+      yield return new Item<T>(prevX, prevI, prevFirst, true);
+    }
+  }
+
+  public struct Peekable<T> : IEnumerator<T>, System.Collections.IEnumerator {
+    private IEnumerator<T> e;
+    private bool peeked;
+    private T previous;
+    public T Current { get => peeked ? previous : e.Current; }
+    object System.Collections.IEnumerator.Current {
+      get => this.Current;
+    }
+    public bool MoveNext() {
+      this.peeked = false;
+      this.previous = default(T);
+      return this.e.MoveNext();
+    }
+    public bool Peek() {
+      if (this.peeked) {
+        throw new Exception("Already peeked once");
+      } else {
+        this.previous = e.Current;
+        this.peeked = true;
+        return this.e.MoveNext();
+      }
+    }
+    public void Dispose() { e.Dispose(); }
+    public void Reset() { e.Reset(); }
+
+    public Peekable(IEnumerable<T> e) {
+      this.e = e.GetEnumerator();
+      this.peeked = false;
+      this.previous = default(T);
+    }
+  }
+
+  public static Peekable<T> Peekable<T>(this IEnumerable<T> e) {
+    return new Peekable<T>(e);
+  }
+
+  public static IEnumerable<T> SingleUseEnumerable<T>(this IEnumerator<T> e) {
+    while (e.MoveNext()) {
+      yield return e.Current;
+    }
+  }
+
+  public static IEnumerable<T> TakeUntil<T>(this IEnumerable<T> e, Func<T, bool> f)
+    => e.TakeWhile(x => !f(x));
+
+  public static IEnumerable<T> Singleton<T>(this T x) {
+    yield return x;
+  }
+}
\ No newline at end of file
diff --git a/Utils/Unicode.cs b/Utils/Unicode.cs
new file mode 100644
index 0000000..8dc4ac7
--- /dev/null
+++ b/Utils/Unicode.cs
@@ -0,0 +1,89 @@
+using System;
+using System.Linq;
+using System.Collections.Generic;
+using System.Globalization;
+
+public struct GraphemeCluster {
+  public readonly bool endOfFile;
+  public readonly string str;
+  public readonly IEnumerable<string> codePoints;
+  public GraphemeCluster(bool endOfFile, string str, IEnumerable<string> codePoints) {
+    this.endOfFile = endOfFile;
+    this.str = str;
+    this.codePoints = codePoints;
+  }
+}
+
+public static class UnicodeExtensionMethods {
+  public static IEnumerable<string> SplitOnSurrogatePairs(this IEnumerable<char> s) {
+    var e = s.GetEnumerator();
+    while (e.MoveNext()) {
+      var firstOfPossiblePair = e.Current;
+      if (firstOfPossiblePair.IsHighSurrogate()) {
+        if (e.MoveNext() && e.Current.IsLowSurrogate()) {
+          yield return firstOfPossiblePair.ToString()
+                     + e.Current.ToString();
+        } else {
+          throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate at the end of the input.");
+        }
+      } else {
+        yield return firstOfPossiblePair.ToString();
+      }
+    }
+  }
+
+  public static int ToUtf32(this string s, int pos)
+    => Char.ConvertToUtf32(s, pos);
+
+  public static IEnumerable<GraphemeCluster> TextElements(this string s) {
+    // in:      "1\u22152e\u0301\u0327a"
+    // out:     [["1"], ["\u2215"], ["2"], ["e", "\u0301", "\u0327"], ["a"]]
+    // i.e.     "1∕2ȩ́a"
+    // becomes  [["1"], ["∕"], ["2"], ["e", "◌́", "◌̧"], ["a"]]
+    // Also groups flag emojis based on unicode "tags" as a single element
+    var e = StringInfo.GetTextElementEnumerator(s);
+    var alreadyMoved = false;
+    while (alreadyMoved || e.MoveNext()) {
+      alreadyMoved = false;
+      // TODO: check whether UTF-16 allows for different
+      // encodings for the same code point and if so how
+      // to compare them correctly.
+      var te = e.Current.ToString();
+      var wavingBlackFlag = 0x1F3F4;
+      // TODO: check the role of "begin" for tag sequences.
+      var begin = 0xE0001;
+      // All the characters between sp and cancelTag are valid tag characters
+      var sp = 0xE0020;
+      var cancelTag = 0xE007F;
+      var first = te.ToUtf32(0);
+      // TODO: te.length is hardcoded as 2 because the tag
+      // code points all require a surrogate pair (i.e. don't
+      // fit in a single UTF-16 element).
+      if ((te.Length == 2 && first == wavingBlackFlag) || first == begin) {
+        while (e.MoveNext()) {
+          var te2 = e.Current.ToString();
+          var first2 = te2.ToUtf32(0);
+          if (te2.Length == 2 && first2 >= sp && first2 <= cancelTag) {
+            te += te2;
+            if (first2 == cancelTag) {
+              break;
+            }
+          } else {
+            alreadyMoved = true;
+            break; // not a tag: the outer loop must process e.Current itself
+          }
+        }
+      }
+      yield return new GraphemeCluster(
+        false,
+        te,
+        te.SplitOnSurrogatePairs()
+      );
+    }
+    yield return new GraphemeCluster(
+      true,
+      "",
+      Enumerable.Empty<string>()
+    );
+  }
+}
\ No newline at end of file
diff --git a/main.cs b/main.cs
index 502c9fa..3f7f609 100644
--- a/main.cs
+++ b/main.cs
@@ -28,9 +28,11 @@ public static class MainClass {
 
       CompileToFile(compile, sourcePath, destPath);
 
-      if (runner.Run(destPath) != expected.Read()) {
+      var actualStr = runner.Run(destPath);
+      var expectedStr = expected.Read();
+      if (actualStr != expectedStr) {
         Console.WriteLine("\x1b[1;31mFail\x1b[m");
-        throw new Exception("Test failed " + source);
+        throw new Exception($"Test failed {source}: expected {expectedStr} but got {actualStr}.");
       } else {
         Console.WriteLine("\x1b[1;32mOK\x1b[m");
       }