envlang-csharp/Utils/Unicode.cs

109 lines
3.6 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Linq;
using System.Collections.Generic;
using System.Globalization;
public struct GraphemeCluster {
public readonly bool endOfFile;
public readonly string str;
public readonly IEnumerable<string> codePoints;
public GraphemeCluster(bool endOfFile, string str, IEnumerable<string> codePoints) {
this.endOfFile = endOfFile;
this.str = str;
this.codePoints = codePoints;
}
public LeafLens<GraphemeCluster, Whole> ChainLens<Whole>(Func<GraphemeCluster, Whole> wrap)
=> new LeafLens<GraphemeCluster, Whole>(wrap: wrap, oldHole: this);
public static implicit operator GraphemeCluster(char c)
=> new GraphemeCluster(false, c.ToString(), c.ToString().Singleton());
}
public static class UnicodeExtensionMethods {
public static IEnumerable<string> SplitOnSurrogatePairs(this IEnumerable<char> s) {
var e = s.GetEnumerator();
while (e.MoveNext()) {
var firstOfPossiblePair = e.Current;
if (firstOfPossiblePair.IsHighSurrogate()) {
e.MoveNext();
if (e.Current.IsLowSurrogate()) {
yield return firstOfPossiblePair.ToString()
+ e.Current.ToString();
} else {
throw new ArgumentException("This UTF-16 string seems malformed: found a high surrogate at the end of the input.");
}
} else {
yield return firstOfPossiblePair.ToString();
}
}
}
public static int ToUtf32(this string s, int pos)
=> Char.ConvertToUtf32(s, pos);
public static IEnumerable<GraphemeCluster> TextElements(this string s) {
// in: "1\u22152e\u0301\u0327a"
// out: [["1"], ["\u2215"], ["2"], [e, "\u0301", "\u0327"], "a"]]
// i.e. "12ȩ́a"
// becomes [["1"], [""], ["2"], ["e", "◌́", "◌̧̧"̧], ["a"]]
// TODO: also groups flag emojis based on unicode "tags" as a single element
var e = StringInfo.GetTextElementEnumerator(s);
var alreadyMoved = false;
while (alreadyMoved || e.MoveNext()) {
alreadyMoved = false;
// TODO: check whether UTF-16 allows for different
// encodings for the same code point and if so how
// to compare them correctly.
var te = e.Current.ToString();
var wavingBlackFlag = 0x1F3F4;
// TODO: check the role of "begin" for tag sequences.
var begin = 0xE0001;
// All the characters between sp and cancelTag are valid tag characters
var sp = 0xE0020;
var cancelTag = 0xE007F;
var first = te.ToUtf32(0);
// TODO: te.length is hardcoded as 2 because the tag
// code points all require a surrogate pair (i.e. don't
// fit in a single UTF-16 element).
if (te.Length == 2 && first == wavingBlackFlag || first == begin) {
while (e.MoveNext()) {
var te2 = e.Current.ToString();
var first2 = te2.ToUtf32(0);
if (te2.Length == 2 && first2 >= sp && first2 <= cancelTag) {
te += te2;
if (first2 == cancelTag) {
alreadyMoved = false;
break;
}
} else {
alreadyMoved = true;
}
}
}
yield return new GraphemeCluster(
false,
te,
te.SplitOnSurrogatePairs()
);
}
yield return new GraphemeCluster(
true,
"",
Enumerable.Empty<string>()
);
}
public static UnicodeCategory UnicodeCategory(this string s, int startIndex)
=> Char.GetUnicodeCategory(s, startIndex);
public static string Description(this GraphemeCluster gc) {
if (gc.endOfFile) {
return "end of file";
} else if (gc.str == "\"") {
return "'\"'";
} else {
return $"\"{gc.str}\"";
}
}
}