pebble/tests/fw/javascript/test_rocky_text_encoding.c
2025-01-27 11:38:16 -08:00

220 lines
7.7 KiB
C

/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "test_jerry_port_common.h"
#include "test_rocky_common.h"
#include "applib/rockyjs/pbl_jerry_port.h"
#include "jerry-api.h"
#include <util/size.h>
#include <clar.h>
#include <stdio.h>
// Fakes
#include "fake_time.h"
#include "fake_logging.h"
#include "fake_pbl_malloc.h"
// Stubs
#include "stubs_app_state.h"
#include "stubs_logging.h"
#include "stubs_passert.h"
// Great read-up on JavaScript and its text encoding quirks:
// https://mathiasbynens.be/notes/javascript-unicode
// Setup hook for the rocky_text_encoding suite.
// NOTE(review): by clar's `test_<suite>__initialize` naming convention this is expected to run
// before every test function in this file -- confirm against the test runner.
void test_rocky_text_encoding__initialize(void) {
// Reset the fake allocator's bookkeeping first, so the net-allocation check performed in
// test_rocky_text_encoding__cleanup() only covers what happens from here on.
fake_pbl_malloc_clear_tracking();
// Bring up the Rocky runtime context before starting the JerryScript engine.
rocky_runtime_context_init();
jerry_init(JERRY_INIT_EMPTY);
}
// Teardown hook for the rocky_text_encoding suite.
// NOTE(review): by clar's `test_<suite>__cleanup` naming convention this is expected to run
// after every test function in this file -- confirm against the test runner.
void test_rocky_text_encoding__cleanup(void) {
// Tear down in the reverse order of test_rocky_text_encoding__initialize():
// engine first, then the Rocky runtime context.
jerry_cleanup();
rocky_runtime_context_deinit();
// Runs last, after both teardown calls above.
// NOTE(review): presumably fails the test when allocations and frees are unbalanced -- confirm
// in fake_pbl_malloc.h.
fake_pbl_malloc_check_net_allocs();
}
// Pins down that JerryScript accepts a CESU-8 encoded string literal in script source.
void test_rocky_text_encoding__jerry_handles_cesu8_strings_in_source(void) {
  // On paper CESU-8 and UTF-8 are not compatible, yet JerryScript's lexer does not object to
  // CESU-8 encoded string literals. Keep a test around so we notice if that ever changes.
  EXECUTE_SCRIPT("var pileOfPooCESU8 = '"
                 "\xed\xa0\xbd"  // CESU-8 bytes of the high surrogate U+D83D
                 "\xed\xb2\xa9"  // CESU-8 bytes of the low surrogate U+DCA9
                 "';");

  // The string must consist of the two surrogate code units, in order:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooCESU8.charCodeAt(0).toString(16)", "d83d");
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooCESU8.charCodeAt(1).toString(16)", "dca9");
}
// Pins down that a 4-byte UTF-8 sequence in script source becomes a surrogate pair in the string.
void test_rocky_text_encoding__jerry_handles_utf8_strings_in_source(void) {
  // The source is UTF-8 encoded.
  // Define a string variable holding Pile of Poo (U+1F4A9), encoded as a 4-byte UTF-8 sequence:
  EXECUTE_SCRIPT("var pileOfPooUTF8 = '"
                 "\xF0\x9F\x92\xA9"
                 "';");

  // Inside the engine it must show up as a pair of surrogate code units:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.charCodeAt(0).toString(16)", "d83d");
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.charCodeAt(1).toString(16)", "dca9");
}
// Expects a SyntaxError when an identifier contains a non-BMP codepoint given as 4-byte UTF-8.
void test_rocky_text_encoding__jerry_asserts_utf8_non_bmp_codepoint_in_identifier(void) {
  // Identifiers must not contain non-BMP codepoints; here U+1F4A9 is UTF-8 encoded and placed in
  // the middle of the identifier:
  EXECUTE_SCRIPT_EXPECT_ERROR(
      "var poo" "\xF0\x9F\x92\xA9" "poo = 'pileOfPoo';",
      "SyntaxError: Invalid (unexpected) character. [line: 1, column: 8]");
}
// Expects a SyntaxError when an identifier contains a non-BMP codepoint given as a CESU-8
// surrogate pair.
void test_rocky_text_encoding__jerry_asserts_cesu8_non_bmp_codepoint_in_identifier(void) {
  // Identifiers must not contain non-BMP codepoints; here U+1F4A9 is CESU-8 encoded (two 3-byte
  // surrogate sequences) and placed in the middle of the identifier:
  EXECUTE_SCRIPT_EXPECT_ERROR(
      "var poo" "\xed\xa0\xbd" "\xed\xb2\xa9" "poo = 'pileOfPoo';",
      "SyntaxError: Invalid (unexpected) character. [line: 1, column: 8]");
}
// Checks that String.length reports code units, not codepoints, for a non-BMP character.
void test_rocky_text_encoding__string_length(void) {
  // A single non-BMP codepoint (U+1F4A9), UTF-8 encoded in the source:
  EXECUTE_SCRIPT("var pileOfPooUTF8 = '"
                 "\xF0\x9F\x92\xA9"
                 "';");

  // String.length counts each surrogate of the pair that represents the codepoint, hence 2:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.length.toString()", "2");
}
// Table-driven test of JerryScript's string -> UTF-8 export, using jerry_get_utf8_string_size()
// and jerry_string_to_utf8_char_buffer().
//
// Each case evaluates a script that defines the global `str`, then checks the reported UTF-8
// size and the exported bytes. Every case runs twice:
//  - pass 0: the destination buffer is exactly as large as the reported size; the copied size
//    and the copied bytes must match the expectation.
//  - pass 1 ("overflow test"): the destination buffer is one byte too small (when the reported
//    size is non-zero); nothing may be copied.
void test_rocky_text_encoding__jerry_cesu8_to_utf8_conversion(void) {
  struct {
    const char *const script;
    size_t expected_utf_size;
    const char *const expected_utf_data;  // left NULL when expected_utf_size is 0
  } cases[] = {
    // Cases 0-5: per the expectations below, unpaired surrogates contribute no bytes to the
    // UTF-8 output.
    [0] = {
      .script = "var str = '\\uDCA9';", // low surrogate only
      .expected_utf_size = 0,
    },
    [1] = {
      .script = "var str = '\\uD83D';", // high surrogate only
      .expected_utf_size = 0,
    },
    [2] = {
      .script = "var str = '\\uDCA9\\uD83D';", // reversed order
      .expected_utf_size = 0,
    },
    [3] = {
      .script = "var str = '\\uD83Dx\\uDCA9';", // non-surrogate in between pair
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [4] = {
      .script = "var str = '\\uD83Dx';", // high surrogate followed by non-surrogate
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [5] = {
      .script = "var str = '\\uDCA9x';", // low surrogate followed by non-surrogate
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [6] = {
      .script = "var str = 'AB';",
      .expected_utf_size = 2,
      .expected_utf_data = "AB",
    },
    [7] = {
      .script = "var str = '\xC4\x91';", // 2-byte codepoint (U+0111)
      .expected_utf_size = 2,
      .expected_utf_data = "\xC4\x91",
    },
    [8] = {
      .script = "var str = '\xE0\xA0\x95';", // 3-byte codepoint (U+0815)
      .expected_utf_size = 3,
      .expected_utf_data = "\xE0\xA0\x95",
    },
    [9] = {
      .script = "var str = '\\uD83D\\uDCA9';", // 4-byte codepoint (U+1F4A9, escaped data)
      .expected_utf_size = 4,
      .expected_utf_data = "\xF0\x9F\x92\xA9",
    },
    [10] = {
      .script = "var str = '\xF0\x9F\x92\xA9';", // 4-byte codepoint (U+1F4A9, UTF-8 data in source)
      .expected_utf_size = 4,
      .expected_utf_data = "\xF0\x9F\x92\xA9",
    },
  };

  for (int j = 0; j < 2; ++j) {
    const bool is_overflow_test = (j == 1);

    // size_t index: ARRAY_LENGTH() yields an unsigned count, avoid a signed/unsigned comparison.
    for (size_t i = 0; i < ARRAY_LENGTH(cases); ++i) {
      // A bool argument is promoted to int when passed through "...", so print it with %d.
      printf("Case %zu (is_overflow_test=%d): %s\n", i, is_overflow_test, cases[i].script);
      EXECUTE_SCRIPT(cases[i].script);

      const jerry_value_t s = JS_GLOBAL_GET_VALUE("str");
      const jerry_size_t utf8_size = jerry_get_utf8_string_size(s);

      // The reported UTF-8 size must match what this case expects:
      cl_assert_equal_i(utf8_size, cases[i].expected_utf_size);

      // Overflow pass: make the buffer one byte too small. When the reported size is 0 there is
      // nothing to shrink, so both passes use an empty buffer for that case.
      const size_t buffer_size = utf8_size ? (is_overflow_test ? (utf8_size - 1) : utf8_size) : 0;

      // malloc, so DUMA will detect buffer overflows:
      jerry_char_t *utf8_buffer = malloc(buffer_size);
      // malloc(0) may legitimately return NULL; any other NULL is an allocation failure and must
      // not be handed to the engine as a writable buffer.
      cl_assert(buffer_size == 0 || utf8_buffer != NULL);

      const jerry_size_t copied_size =
          jerry_string_to_utf8_char_buffer(s, utf8_buffer, buffer_size);

      if (!is_overflow_test) {
        cl_assert_equal_i(copied_size, cases[i].expected_utf_size);
        if (cases[i].expected_utf_size) {
          cl_assert_equal_m(utf8_buffer, cases[i].expected_utf_data, cases[i].expected_utf_size);
        }
      } else {
        // When buffer is too small, expect 0 bytes copied:
        cl_assert_equal_i(copied_size, 0);
      }

      jerry_release_value(s);
      free(utf8_buffer);
    }
  }
}
// Table-driven test of the UTF-8 -> CESU-8 direction: each UTF-8 input is turned into a
// JerryScript string with jerry_create_string_utf8(), read back with
// jerry_string_to_char_buffer(), and the bytes are compared against the expected CESU-8 output.
void test_rocky_text_encoding__jerry_utf8_to_cesu8_conversion(void) {
  struct {
    const char *const utf8_input;
    const char *const cesu8_output;
  } cases[] = {
    {
      .utf8_input = "",
      .cesu8_output = "",
    },
    {
      .utf8_input = "abc",
      .cesu8_output = "abc",
    },
    {
      // U+1F4A9 expands to surrogate pair:
      .utf8_input = "abc\xF0\x9F\x92\xA9xyz",
      .cesu8_output = "abc\xed\xa0\xbd\xed\xb2\xa9xyz",
    },
    {
      // Be lax with surrogates: even though they're not supposed to appear in UTF-8,
      // just copy them over to the CESU-8 output, even a "half pair":
      .utf8_input = "\xed\xa0\xbd",
      .cesu8_output = "\xed\xa0\xbd",
    },
  };

  // size_t index: ARRAY_LENGTH() yields an unsigned count, avoid a signed/unsigned comparison.
  for (size_t i = 0; i < ARRAY_LENGTH(cases); ++i) {
    // `{0}` rather than `{}`: the empty initializer is not valid ISO C before C23.
    // 32 bytes is larger than every expected output in the table above.
    jerry_char_t output[32] = {0};

    const jerry_value_t s = jerry_create_string_utf8((const jerry_char_t *)cases[i].utf8_input);
    const jerry_size_t copied_bytes = jerry_string_to_char_buffer(s, output, sizeof(output));

    // The expected outputs contain no embedded NUL, so strlen() gives their byte length.
    const size_t expected_bytes = strlen(cases[i].cesu8_output);
    cl_assert_equal_i(copied_bytes, expected_bytes);
    if (copied_bytes) {
      cl_assert_equal_m(output, cases[i].cesu8_output, copied_bytes);
    }

    // TODO: test equality/hash
    jerry_release_value(s);
  }
}