Fix incorrect base64 encoding of astral-plane text (#4813)

Most astral-plane text is emojis like U+1F4DA BOOKS (📚), but some
languages like Osage have their alphabet entirely in the supplementary
multilingual plane as well. For proper support of languages like Osage,
and newer emojis, the UTF-8 decode and encode functions need to properly
handle codepoints above U+FFFF, each of which is represented by a surrogate
pair in JavaScript strings.
This commit is contained in:
Robin Munn
2020-08-17 23:33:46 +07:00
committed by GitHub
parent f74c49f393
commit c23eedd069
3 changed files with 43 additions and 12 deletions

View File

@@ -25,6 +25,19 @@ describe("Utility tests", function() {
expect(psa(" [[Tidd\u00a0ler8]] two ")).toEqual(["Tidd\u00a0ler8","two"]);
});
it("should handle base64 encoding emojis", function() {
	// 📚 is U+1F4DA BOOKS, a codepoint above U+FFFF; JavaScript strings hold it
	// as the UTF-16 surrogate pair 0xD83D 0xDCDA (two code units, one codepoint).
	var booksEmoji = "📚";
	// Compare the literal with its escaped surrogate-pair form so that this
	// assertion can actually fail if the literal was not loaded as expected.
	expect(booksEmoji).toBe("\uD83D\uDCDA");
	expect(booksEmoji.length).toBe(2);
	expect(booksEmoji.charCodeAt(0)).toBe(55357); // 0xD83D (high surrogate)
	expect(booksEmoji.charCodeAt(1)).toBe(56538); // 0xDCDA (low surrogate)
	var encoded = $tw.utils.base64Encode(booksEmoji);
	// "7aC97bOa" is base64 of ED A0 BD ED B3 9A, i.e. each surrogate encoded
	// separately as a 3-byte sequence instead of one 4-byte UTF-8 sequence.
	expect(encoded).not.toBe("7aC97bOa", "if base64 is 7aC97bOa then surrogate pairs were incorrectly treated as codepoints");
	// "8J+Tmg==" is base64 of F0 9F 93 9A, the 4-byte UTF-8 encoding of U+1F4DA.
	expect(encoded).toBe("8J+Tmg==", "if surrogate pairs are correctly treated as a single codepoint then base64 should be 8J+Tmg==");
	expect($tw.utils.base64Decode("8J+Tmg==")).toBe(booksEmoji);
	expect($tw.utils.base64Decode(encoded)).toBe(booksEmoji, "should round-trip correctly");
});
it("should handle stringifying a string array", function() {
var str = $tw.utils.stringifyList;
expect(str([])).toEqual("");