Unicode lexer & compilation to JS of strings and of numbers (previously a placeholder).
This commit is contained in:
parent
3f23248c14
commit
9a885195c2
|
@ -1,7 +1,12 @@
|
|||
namespace Compilers {
  /// <summary>
  /// Back-end that turns an AST expression into the source text of a
  /// JavaScript program.
  /// </summary>
  public class JS {
    /// <summary>
    /// Compiles <paramref name="source"/> to a JavaScript statement that
    /// writes the value of the expression to <c>process.stdout</c>.
    /// </summary>
    /// <param name="source">Expression to compile; dispatched on via its
    /// <c>Match</c> method (Int / String cases).</param>
    /// <returns>JavaScript source text of the form
    /// <c>process.stdout.write(String(VALUE));</c>.</returns>
    public static string Compile(Ast.Expr source) {
      // The former placeholder `return "process.stdout.write('42');";` was
      // removed: it preceded this statement and made it unreachable.
      return "process.stdout.write(String("
        + source.Match(
            Int: i => i.ToString(),
            // The text is embedded in a single-quoted JS literal, so it must
            // be escaped or a quote/backslash/newline would break the output.
            String: s => "'" + EscapeForSingleQuotedLiteral(s.ToString()) + "'"
          )
        + "));";
    }

    /// <summary>
    /// Escapes <paramref name="text"/> so that it can be placed between
    /// single quotes in JavaScript source without changing its value.
    /// </summary>
    /// <param name="text">Raw string contents; null is treated as empty.</param>
    /// <returns>The escaped contents, without the surrounding quotes.</returns>
    private static string EscapeForSingleQuotedLiteral(string text) {
      if (string.IsNullOrEmpty(text)) {
        return "";
      }
      var escaped = new System.Text.StringBuilder(text.Length);
      foreach (var ch in text) {
        switch (ch) {
          case '\\': escaped.Append("\\\\"); break;
          case '\'': escaped.Append("\\'"); break;
          case '\n': escaped.Append("\\n"); break;
          case '\r': escaped.Append("\\r"); break;
          // Line/paragraph separators terminate string literals in
          // pre-ES2019 engines, so they are always written escaped.
          case '\u2028': escaped.Append("\\u2028"); break;
          case '\u2029': escaped.Append("\\u2029"); break;
          default: escaped.Append(ch); break;
        }
      }
      return escaped.ToString();
    }
  }
}
|
|
@ -5,4 +5,7 @@ public static class Global {
|
|||
/// <summary>
/// Shorthand factory: builds a new <c>Exe</c> from <paramref name="str"/>.
/// </summary>
/// <param name="str">Value handed unchanged to the <c>Exe</c> constructor
/// (presumably a path or command — confirm against the Exe type).</param>
/// <returns>The newly constructed <c>Exe</c>.</returns>
public static Exe Exe (string str) {
  return new Exe (str);
}
|
||||
|
||||
/// <summary>Writes <paramref name="str"/> followed by a newline to standard output.</summary>
/// <param name="str">Text to print.</param>
public static void Log (string str) {
  System.Console.WriteLine(str);
}
|
||||
|
||||
/// <summary>
/// Static, function-shaped wrapper around <c>StringBuilder.Append(string)</c>,
/// usable as a method group (e.g. as the accumulator of a fold).
/// </summary>
/// <param name="b">Builder to append to; it is mutated.</param>
/// <param name="s">Text to append.</param>
/// <returns>Whatever <c>b.Append(s)</c> returns, i.e. the same builder.</returns>
public static System.Text.StringBuilder Append(System.Text.StringBuilder b, string s) {
  return b.Append(s);
}
|
||||
}
|
140
Parser.cs
140
Parser.cs
|
@ -1,6 +1,140 @@
|
|||
using Ast;
|
||||
using System;
|
||||
using System.Text;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Globalization;
|
||||
using C = System.Globalization.UnicodeCategory;
|
||||
using static Global;
|
||||
|
||||
/// <summary>
/// Hand-written lexer (a small state machine over grapheme clusters) and the
/// parser built on top of it.
/// </summary>
public static class Parser {
  /// <summary>
  /// Lexer states. Every emitted <c>Lexeme</c> is tagged with the state the
  /// lexer was in while its text was being accumulated.
  /// </summary>
  public enum S {
    End,
    Space,
    Int,
    Decimal,
    String,
  }

  /// <summary>A piece of source text together with the lexer state that produced it.</summary>
  public struct Lexeme {
    public readonly S state;
    // TODO: maybe keep this as a list of grapheme clusters
    public readonly string lexeme;

    public Lexeme(S state, string lexeme) {
      this.state = state;
      this.lexeme = lexeme;
    }

    /// <summary>Debug representation, formatted like a constructor call.</summary>
    public override string ToString() {
      return $"new Lexeme({state}, \"{lexeme}\")";
    }
  }

  /// <summary>
  /// Transition: feeds grapheme cluster <paramref name="c"/> to the state
  /// machine, moving it to <paramref name="newState"/>.
  /// </summary>
  /// <param name="state">Current state; overwritten when the state changes.</param>
  /// <param name="lexeme">Text accumulated so far; extended, or restarted
  /// from <paramref name="c"/> when the state changes.</param>
  /// <param name="c">Cluster being consumed.</param>
  /// <param name="newState">State the machine is in after consuming <paramref name="c"/>.</param>
  /// <returns>
  /// A one-element sequence holding the lexeme that just ended when the state
  /// changed, otherwise an empty sequence.
  /// </returns>
  private static IEnumerable<Lexeme> T(ref S state, ref string lexeme, GraphemeCluster c, S newState) {
    if (newState == state) {
      // Still inside the same lexeme: just grow it.
      lexeme += c.str;
      return Enumerable.Empty<Lexeme>();
    }
    var finished = new Lexeme(state, lexeme);
    state = newState;
    lexeme = "" + c.str;
    return finished.Singleton();
  }

  /// <summary>
  /// Reports a lexing failure. Always throws.
  /// </summary>
  /// <param name="context">Source text consumed so far (as accumulated by
  /// the caller, it already includes the offending cluster).</param>
  /// <param name="stream">Remaining input; it is consumed up to the next
  /// cluster starting with a newline to show the rest of the line.</param>
  /// <exception cref="FormatException">Always; the message shows the context,
  /// a <c>__HERE:__</c> marker and the rest of the current line.</exception>
  public static void ParseError(StringBuilder context, IEnumerator<GraphemeCluster> stream) {
    var rest =
      stream
        .SingleUseEnumerable()
        // Ordinal: this is a code-point test, not a linguistic comparison.
        .TakeUntil(c => c.str.StartsWith("\n", StringComparison.Ordinal))
        .Select(c => c.str)
        .Aggregate(new StringBuilder(), Append);

    throw new FormatException(
      $"Cannot parse this:{Environment.NewLine}{context}__HERE:__{rest}"
    );
  }

  // Fake Unicode category used to represent the end-of-file pseudo-cluster.
  private const UnicodeCategory EndOfFile = (UnicodeCategory)(-1);

  /// <summary>
  /// Runs the state machine over <paramref name="source"/>, one grapheme
  /// cluster at a time.
  /// </summary>
  /// <param name="source">Program text.</param>
  /// <returns>
  /// One (empty or one-element) sequence of lexemes per consumed cluster;
  /// see <c>Lex</c> for the flattened view.
  /// </returns>
  /// <exception cref="FormatException">Raised (through <c>ParseError</c>,
  /// during enumeration) when a cluster is not accepted in the current state.</exception>
  public static IEnumerable<IEnumerable<Lexeme>> Lex1(string source) {
    var context = new StringBuilder();
    var lexeme = "";
    var state = S.Space;
    using (var e = source.TextElements().GetEnumerator()) {
      while (e.MoveNext()) {
        var c = e.Current;
        context.Append(c.str);
        // A cluster is classified by the category of its first code point.
        var charCategory =
          c.endOfFile
            ? EndOfFile
            : Char.GetUnicodeCategory(c.codePoints.First(), 0);
        switch (state) {
          // Both states currently accept exactly the same inputs, so they
          // share one transition table.
          case S.Space:
          case S.Int:
            switch (charCategory) {
              case C.DecimalDigitNumber:
                yield return T(ref state, ref lexeme, c, S.Int);
                break;
              case C.SpaceSeparator:
                yield return T(ref state, ref lexeme, c, S.Space);
                break;
              case EndOfFile:
                yield return T(ref state, ref lexeme, c, S.End);
                break;
              default:
                ParseError(context, e);
                break;
            }
            break;
        }
      }
    }
  }

  /// <summary>
  /// Lexes <paramref name="source"/> into a flat stream of lexemes.
  /// </summary>
  /// <param name="source">Program text.</param>
  /// <returns>The lexemes in source order, without the empty lexeme(s) that
  /// the initial <c>Space</c> state produces before any text was read.</returns>
  public static IEnumerable<Lexeme> Lex(string source) {
    var first = true;
    foreach (var x in Lex1(source).SelectMany(x => x)) {
      if (first && x.lexeme == "") {
        // skip the initial empty whitespace
        continue;
      }
      first = false;
      yield return x;
    }
  }

  /// <summary>
  /// Converts a run of Unicode decimal digits to an <c>int</c>.
  /// </summary>
  /// <remarks>
  /// The lexer accepts every code point of category DecimalDigitNumber, but
  /// <c>Int32.Parse</c> only understands ASCII 0-9; this helper accepts the
  /// same digits as the lexer.
  /// </remarks>
  /// <exception cref="FormatException">The text is empty or contains
  /// something that is not a decimal digit.</exception>
  /// <exception cref="OverflowException">The value does not fit in an <c>int</c>.</exception>
  private static int ParseDecimalDigits(string digits) {
    if (string.IsNullOrEmpty(digits)) {
      throw new FormatException("Expected at least one decimal digit.");
    }
    var value = 0;
    var i = 0;
    while (i < digits.Length) {
      var digit = CharUnicodeInfo.GetDecimalDigitValue(digits, i);
      if (digit < 0) {
        throw new FormatException($"Not a decimal integer: \"{digits}\"");
      }
      value = checked(value * 10 + digit);
      // Digits outside the BMP occupy two UTF-16 code units.
      i += char.IsSurrogatePair(digits, i) ? 2 : 1;
    }
    return value;
  }

  /// <summary>
  /// Parses <paramref name="source"/> into an expression. Only the first
  /// lexeme is looked at for now.
  /// </summary>
  /// <param name="source">Program text.</param>
  /// <returns>An <c>Ast.Int</c> or <c>Ast.String</c> built from the first lexeme.</returns>
  /// <exception cref="NotImplementedException">The first lexeme is neither an
  /// integer nor a string.</exception>
  /// <exception cref="FormatException">The input produced no lexeme, or could
  /// not be lexed.</exception>
  public static Ast.Expr Parse(string source) {
    foreach (var lexeme in Lex(source)) {
      switch (lexeme.state) {
        case S.Int:
          return new Ast.Int(ParseDecimalDigits(lexeme.lexeme));
        case S.String:
          return new Ast.String(lexeme.lexeme);
        default:
          throw new NotImplementedException();
      }
    }
    throw new FormatException("empty file, rm this when consuming the whole stream of lexemes.");
  }
}
|
1
Tests/002-43.e
Normal file
1
Tests/002-43.e
Normal file
|
@ -0,0 +1 @@
|
|||
43
|
1
Tests/002-43.o
Normal file
1
Tests/002-43.o
Normal file
|
@ -0,0 +1 @@
|
|||
43
|
9
Utils/Char.cs
Normal file
9
Utils/Char.cs
Normal file
|
@ -0,0 +1,9 @@
|
|||
using System;
|
||||
|
||||
/// <summary>
/// Extension-method spellings of the static surrogate tests on <c>char</c>,
/// so that they can be written as <c>c.IsHighSurrogate()</c>.
/// </summary>
public static class CharExtensionMethods {
  /// <summary>True when <paramref name="c"/> is a UTF-16 high (leading) surrogate.</summary>
  public static bool IsHighSurrogate(this char c) {
    return char.IsHighSurrogate(c);
  }

  /// <summary>True when <paramref name="c"/> is a UTF-16 low (trailing) surrogate.</summary>
  public static bool IsLowSurrogate(this char c) {
    return char.IsLowSurrogate(c);
  }
}
|
|
@ -1,31 +0,0 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>Small helpers for lists, tuples and enumerables.</summary>
public static class Collection {
  /// <summary>
  /// Calls <paramref name="f"/> on every element of <paramref name="x"/>.
  /// The sequence is copied to a list first, then that list is traversed.
  /// </summary>
  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f) {
    var snapshot = x.ToList();
    snapshot.ForEach(f);
  }

  // An immutable-list flavour (ListI<Tuple<T,U>>.Add(x, y)) was sketched here
  // but is disabled: System.Collections.Immutable requires NuGet and is not
  // available on repl.it, hence the mutable List-based helpers below.

  /// <summary>Appends <paramref name="x"/> to <paramref name="l"/> in place and returns the same list, for chaining.</summary>
  public static List<T> Cons<T>(this List<T> l, T x) {
    l.Add(x);
    return l;
  }

  // The tuple overloads circumvent a bug with collection initializers,
  // tuples and first-class functions by using repeated .Add() instead.
  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs

  /// <summary>Appends the pair (<paramref name="x"/>, <paramref name="y"/>) in place and returns the same list.</summary>
  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y) {
    l.Add(Tuple.Create(x, y));
    return l;
  }

  /// <summary>Appends the triple (<paramref name="x"/>, <paramref name="y"/>, <paramref name="z"/>) in place and returns the same list.</summary>
  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z) {
    l.Add(Tuple.Create(x, y, z));
    return l;
  }

  /// <summary>Enables <c>var (a, b) = tuple;</c> for <c>System.Tuple</c> pairs.</summary>
  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
    a = t.Item1;
    b = t.Item2;
  }
}
|
114
Utils/Enumerable.cs
Normal file
114
Utils/Enumerable.cs
Normal file
|
@ -0,0 +1,114 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>Small helpers for lists, tuples, enumerables and enumerators.</summary>
public static class Collection {
  /// <summary>
  /// Calls <paramref name="f"/> on every element of <paramref name="x"/>.
  /// The sequence is copied to a list first, then that list is traversed.
  /// </summary>
  public static void ForEach<T>(this IEnumerable<T> x, Action<T> f)
    => x.ToList().ForEach(f);

  // An immutable-list flavour (ListI<Tuple<T,U>>.Add(x, y)) was sketched here
  // but is disabled: System.Collections.Immutable requires NuGet and is not
  // available on repl.it, hence the mutable List-based helpers below.

  /// <summary>Appends <paramref name="x"/> to <paramref name="l"/> in place and returns the same list, for chaining.</summary>
  public static List<T> Cons<T>(this List<T> l, T x) { l.Add(x); return l; }

  // Circumvent bug with collection initializers, tuples and
  // first-class functions by using repeated .Add()
  // See https://repl.it/@suzannesoy/WarlikeWorstTraining#main.cs

  /// <summary>Appends the pair (<paramref name="x"/>, <paramref name="y"/>) in place and returns the same list.</summary>
  public static List<Tuple<T,U>> Cons<T,U>(this List<Tuple<T,U>> l, T x, U y)
    => l.Cons(Tuple.Create(x,y));

  /// <summary>Appends the triple (<paramref name="x"/>, <paramref name="y"/>, <paramref name="z"/>) in place and returns the same list.</summary>
  public static List<Tuple<T,U,V>> Cons<T,U,V>(this List<Tuple<T,U,V>> l, T x, U y, V z)
    => l.Cons(Tuple.Create(x,y,z));

  /// <summary>Enables <c>var (a, b) = tuple;</c> for <c>System.Tuple</c> pairs.</summary>
  public static void Deconstruct<A, B>(this Tuple<A, B> t, out A a, out B b) {
    a = t.Item1;
    b = t.Item2;
  }

  /// <summary>
  /// An element of a sequence together with its zero-based position and
  /// whether it is the first and/or the last element.
  /// </summary>
  public struct Item<T> {
    public readonly T item;
    public readonly long index;
    public readonly bool first;
    public readonly bool last;

    public Item(T item, long index, bool first, bool last) {
      this.item = item;
      this.index = index;
      this.first = first;
      this.last = last;
    }
  }

  /// <summary>
  /// Decorates every element of <paramref name="e"/> with its index and
  /// first/last flags.
  /// </summary>
  /// <remarks>
  /// Whether an element is the last one is only known once the source has
  /// been asked for the next element, so results are emitted one step behind
  /// the source.
  /// </remarks>
  public static IEnumerable<Item<T>> Indexed<T>(this IEnumerable<T> e) {
    long i = 0L;
    bool first = true;
    T prevX = default(T); // Dummy
    long prevI = default(long);
    bool prevFirst = default(bool);
    foreach (var x in e) {
      if (!first) {
        // A successor exists, so the held-back element is not the last one.
        yield return new Item<T>(prevX, prevI, prevFirst, false);
      }
      prevX = x;
      prevI = i;
      prevFirst = first;
      first = false;
      i++;
    }
    if (!first) {
      // Source exhausted: the held-back element is the last one.
      yield return new Item<T>(prevX, prevI, prevFirst, true);
    }
  }

  /// <summary>
  /// Enumerator wrapper that can check whether a next element exists
  /// (<see cref="Peek"/>) without changing what <see cref="Current"/> returns.
  /// </summary>
  /// <remarks>
  /// This is a mutable struct: keep it in a local or field and call it
  /// through that variable, otherwise the calls act on a copy.
  /// </remarks>
  public struct Peekable<T> : IEnumerator<T>, System.Collections.IEnumerator {
    private IEnumerator<T> e;
    // True between a Peek() and the following MoveNext().
    private bool peeked;
    // Result of the underlying MoveNext() performed by Peek().
    private bool peekedHasNext;
    // Element that was current when Peek() was called.
    private T previous;

    /// <summary>The current element; unchanged by <see cref="Peek"/>.</summary>
    public T Current { get => peeked ? previous : e.Current; }

    object System.Collections.IEnumerator.Current {
      get => this.Current;
    }

    /// <summary>Advances to the next element.</summary>
    /// <returns>True when there is a next element.</returns>
    public bool MoveNext() {
      if (this.peeked) {
        // Peek() already advanced the underlying enumerator; advancing it
        // again here would silently skip one element.
        this.peeked = false;
        this.previous = default(T);
        return this.peekedHasNext;
      }
      return this.e.MoveNext();
    }

    /// <summary>
    /// Tells whether a next element exists, leaving <see cref="Current"/>
    /// on the present element.
    /// </summary>
    /// <returns>True when a following <see cref="MoveNext"/> will succeed.</returns>
    /// <exception cref="InvalidOperationException">Peek was already called
    /// since the last <see cref="MoveNext"/>.</exception>
    public bool Peek() {
      if (this.peeked) {
        throw new InvalidOperationException("Already peeked once");
      }
      this.previous = e.Current;
      this.peeked = true;
      this.peekedHasNext = this.e.MoveNext();
      return this.peekedHasNext;
    }

    public void Dispose() { e.Dispose(); }

    /// <summary>Resets the underlying enumerator and forgets any pending peek.</summary>
    public void Reset() {
      this.peeked = false;
      this.peekedHasNext = false;
      this.previous = default(T);
      e.Reset();
    }

    public Peekable(IEnumerable<T> e) {
      this.e = e.GetEnumerator();
      this.peeked = false;
      this.peekedHasNext = false;
      this.previous = default(T);
    }
  }

  /// <summary>Starts a peekable enumeration of <paramref name="e"/>.</summary>
  public static Peekable<T> Peekable<T>(this IEnumerable<T> e) {
    return new Peekable<T>(e);
  }

  /// <summary>
  /// Exposes the remaining elements of an enumerator as a sequence. The
  /// enumerator is consumed as the sequence is read, so the result can only
  /// be traversed once.
  /// </summary>
  public static IEnumerable<T> SingleUseEnumerable<T>(this IEnumerator<T> e) {
    while (e.MoveNext()) {
      yield return e.Current;
    }
  }

  /// <summary>
  /// Returns the leading elements of <paramref name="e"/> up to, and not
  /// including, the first one for which <paramref name="f"/> is true.
  /// </summary>
  public static IEnumerable<T> TakeUntil<T>(this IEnumerable<T> e, Func<T, bool> f)
    => e.TakeWhile(x => !f(x));

  /// <summary>Wraps <paramref name="x"/> in a one-element sequence.</summary>
  public static IEnumerable<T> Singleton<T>(this T x) {
    yield return x;
  }
}
|
89
Utils/Unicode.cs
Normal file
89
Utils/Unicode.cs
Normal file
|
@ -0,0 +1,89 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
|
||||
/// <summary>
/// One user-perceived character of the input, or the end-of-file marker.
/// </summary>
public struct GraphemeCluster {
  /// <summary>True for the pseudo-cluster that marks the end of the input.</summary>
  public readonly bool endOfFile;

  /// <summary>The cluster's text as a UTF-16 string.</summary>
  public readonly string str;

  /// <summary>The cluster's text split into pieces, one string per code point.</summary>
  public readonly IEnumerable<string> codePoints;

  /// <summary>Stores the three values as given; nothing is validated or copied.</summary>
  public GraphemeCluster(bool endOfFile, string str, IEnumerable<string> codePoints) {
    this.codePoints = codePoints;
    this.str = str;
    this.endOfFile = endOfFile;
  }
}
|
||||
|
||||
/// <summary>Helpers for walking UTF-16 text by code point and by grapheme cluster.</summary>
public static class UnicodeExtensionMethods {
  /// <summary>
  /// Splits a sequence of UTF-16 code units into one string per code point:
  /// a surrogate pair stays together, every other unit stands alone.
  /// </summary>
  /// <param name="s">UTF-16 code units, e.g. a <c>string</c>.</param>
  /// <returns>One string (of length 1 or 2) per code point, lazily.</returns>
  /// <exception cref="ArgumentException">Raised during enumeration when a
  /// high surrogate is the last unit or is not followed by a low surrogate.</exception>
  public static IEnumerable<string> SplitOnSurrogatePairs(this IEnumerable<char> s) {
    using (var e = s.GetEnumerator()) {
      while (e.MoveNext()) {
        var firstOfPossiblePair = e.Current;
        if (!Char.IsHighSurrogate(firstOfPossiblePair)) {
          yield return firstOfPossiblePair.ToString();
          continue;
        }
        // The result of MoveNext() must be checked: reading Current after
        // the end of the input is not valid.
        if (!e.MoveNext()) {
          throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate at the end of the input.");
        }
        if (!Char.IsLowSurrogate(e.Current)) {
          throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate that is not followed by a low surrogate.");
        }
        yield return firstOfPossiblePair.ToString()
                   + e.Current.ToString();
      }
    }
  }

  /// <summary>Returns the code point that starts at index <paramref name="pos"/> of <paramref name="s"/>.</summary>
  public static int ToUtf32(this string s, int pos)
    => Char.ConvertToUtf32(s, pos);

  /// <summary>
  /// Splits <paramref name="s"/> into grapheme clusters and appends an
  /// end-of-file marker.
  /// </summary>
  /// <remarks>
  /// in:  "1\u22152e\u0301\u0327a"
  /// out: [["1"], ["\u2215"], ["2"], ["e", "\u0301", "\u0327"], ["a"]]
  /// i.e. a base letter and its combining marks form a single cluster whose
  /// code points are listed separately.
  /// On top of what <c>StringInfo</c> reports as text elements, a waving
  /// black flag (or a "begin" tag) followed by Unicode tag characters is
  /// grouped into one cluster, so that tag-based flag emojis are one element.
  /// </remarks>
  /// <param name="s">Text to split.</param>
  /// <returns>The clusters in order, followed by one cluster with
  /// <c>endOfFile == true</c> and empty text.</returns>
  public static IEnumerable<GraphemeCluster> TextElements(this string s) {
    const int wavingBlackFlag = 0x1F3F4;
    // TODO: check the role of "begin" for tag sequences.
    const int begin = 0xE0001;
    // All the characters between sp and cancelTag are valid tag characters
    const int sp = 0xE0020;
    const int cancelTag = 0xE007F;

    var e = StringInfo.GetTextElementEnumerator(s);
    // True when the tag loop below has already advanced `e` onto an element
    // that still has to be processed by the outer loop.
    var alreadyMoved = false;
    while (alreadyMoved || e.MoveNext()) {
      alreadyMoved = false;
      // TODO: check whether UTF-16 allows for different
      // encodings for the same code point and if so how
      // to compare them correctly.
      var te = e.GetTextElement();
      var first = te.ToUtf32(0);
      // TODO: te.length is hardcoded as 2 because the tag
      // code points all require a surrogate pair (i.e. don't
      // fit in a single UTF-16 element).
      // NOTE(review): parenthesized to show the existing precedence; the
      // length test only guards the flag case — confirm that is intended.
      if ((te.Length == 2 && first == wavingBlackFlag) || first == begin) {
        while (e.MoveNext()) {
          var te2 = e.GetTextElement();
          var first2 = te2.ToUtf32(0);
          if (te2.Length == 2 && first2 >= sp && first2 <= cancelTag) {
            te += te2;
            if (first2 == cancelTag) {
              break;
            }
          } else {
            // Not part of the tag sequence: stop here and let the outer loop
            // process this element. Without the break, the loop kept
            // consuming (and dropping) the rest of the input.
            alreadyMoved = true;
            break;
          }
        }
      }
      yield return new GraphemeCluster(
        false,
        te,
        te.SplitOnSurrogatePairs()
      );
    }
    yield return new GraphemeCluster(
      true,
      "",
      Enumerable.Empty<string>()
    );
  }
}
|
6
main.cs
6
main.cs
|
@ -28,9 +28,11 @@ public static class MainClass {
|
|||
|
||||
CompileToFile(compile, sourcePath, destPath);
|
||||
|
||||
if (runner.Run(destPath) != expected.Read()) {
|
||||
var actualStr = runner.Run(destPath);
|
||||
var expectedStr = expected.Read();
|
||||
if (actualStr != expectedStr) {
|
||||
Console.WriteLine("\x1b[1;31mFail\x1b[m");
|
||||
throw new Exception("Test failed " + source);
|
||||
throw new Exception($"Test failed {source}: expected {expectedStr} but got {actualStr}.");
|
||||
} else {
|
||||
Console.WriteLine("\x1b[1;32mOK\x1b[m");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user