pebble/tests/fw/javascript/test_rocky_text_encoding.c

/*
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "test_jerry_port_common.h"
#include "test_rocky_common.h"
#include "applib/rockyjs/pbl_jerry_port.h"

#include "jerry-api.h"

#include <util/size.h>

#include <clar.h>
#include <stdio.h>

// Fakes
#include "fake_time.h"
#include "fake_logging.h"
#include "fake_pbl_malloc.h"

// Stubs
#include "stubs_app_state.h"
#include "stubs_logging.h"
#include "stubs_passert.h"

// Great read-up on JavaScript and its text encoding quirks:
// https://mathiasbynens.be/notes/javascript-unicode

// Per-test setup: resets the fake allocator's tracking, then brings up the Rocky runtime context
// and initializes JerryScript with the JERRY_INIT_EMPTY flag.
// NOTE(review): presumably invoked by the clar runner before each test of this suite -- confirm.
void test_rocky_text_encoding__initialize(void) {
  fake_pbl_malloc_clear_tracking();
  rocky_runtime_context_init();
  jerry_init(JERRY_INIT_EMPTY);
}

// Per-test teardown: shuts down JerryScript and the Rocky runtime context in the reverse order of
// test_rocky_text_encoding__initialize(), then asks the fake allocator to check net allocations.
// NOTE(review): presumably fake_pbl_malloc_check_net_allocs() fails the test when something
// allocated through the fake allocator was not freed -- confirm against fake_pbl_malloc.h.
void test_rocky_text_encoding__cleanup(void) {
  jerry_cleanup();
  rocky_runtime_context_deinit();
  fake_pbl_malloc_check_net_allocs();
}

// Feeds the lexer a string literal whose bytes are the CESU-8 form of U+1F4A9, i.e. the
// surrogates U+D83D (ED A0 BD) and U+DCA9 (ED B2 A9), each encoded as its own 3-byte sequence.
void test_rocky_text_encoding__jerry_handles_cesu8_strings_in_source(void) {
  // Although CESU-8 and UTF-8 are not compatible on paper, JerryScript's lexer doesn't mind if
  // we feed it CESU-8 encoded strings... Test this, so we know when this changes in the future:
  EXECUTE_SCRIPT("var pileOfPooCESU8 = '\xed\xa0\xbd\xed\xb2\xa9';");
  // Expect a pair of surrogate code points:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooCESU8.charCodeAt(0).toString(16)", "d83d");
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooCESU8.charCodeAt(1).toString(16)", "dca9");
}

// Feeds the lexer a string literal containing U+1F4A9 as a single 4-byte UTF-8 sequence and
// checks that the resulting JS string exposes it as a surrogate pair.
void test_rocky_text_encoding__jerry_handles_utf8_strings_in_source(void) {
  // The script source is UTF-8 encoded.
  // Have a string variable with Pile of Poo (💩) or U+1F4A9 in it, encoded using 4-bytes:
  EXECUTE_SCRIPT("var pileOfPooUTF8 = '\xF0\x9F\x92\xA9';");
  // Expect a pair of surrogate code points:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.charCodeAt(0).toString(16)", "d83d");
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.charCodeAt(1).toString(16)", "dca9");
}

// Expects a SyntaxError when an identifier embeds U+1F4A9 as a 4-byte UTF-8 sequence.
void test_rocky_text_encoding__jerry_asserts_utf8_non_bmp_codepoint_in_identifier(void) {
  // It's forbidden to have an identifier contain a non-BMP codepoint (UTF-8 encoded):
  // The expected error column (8) is the position right after the 7 characters of "var poo",
  // i.e. where the non-BMP sequence starts.
  EXECUTE_SCRIPT_EXPECT_ERROR("var poo\xF0\x9F\x92\xA9poo = 'pileOfPoo';",
                              "SyntaxError: Invalid (unexpected) character. [line: 1, column: 8]");
}

// Expects a SyntaxError when an identifier embeds U+1F4A9 as a CESU-8 encoded surrogate pair
// (two 3-byte sequences).
void test_rocky_text_encoding__jerry_asserts_cesu8_non_bmp_codepoint_in_identifier(void) {
  // It's forbidden to have an identifier contain a non-BMP codepoint (CESU-8 encoded):
  // The expected error column (8) is the position right after the 7 characters of "var poo",
  // i.e. where the first surrogate sequence starts.
  EXECUTE_SCRIPT_EXPECT_ERROR("var poo\xed\xa0\xbd\xed\xb2\xa9poo = 'pileOfPoo';",
                              "SyntaxError: Invalid (unexpected) character. [line: 1, column: 8]");
}

// Checks String.length for a string holding the single non-BMP codepoint U+1F4A9, given to the
// script as a 4-byte UTF-8 sequence.
void test_rocky_text_encoding__string_length(void) {
  EXECUTE_SCRIPT("var pileOfPooUTF8 = '\xF0\x9F\x92\xA9';");
  // String.length is expected to count the surrogate code points that make up a non-BMP codepoint:
  EXECUTE_SCRIPT_AND_ASSERT_RV_EQUALS_S("pileOfPooUTF8.length.toString()", "2");
}

// Checks the conversion of JerryScript strings to UTF-8 through jerry_get_utf8_string_size() and
// jerry_string_to_utf8_char_buffer().
//
// Each case runs a script that assigns the global `str`, then compares the reported UTF-8 size
// and the copied bytes against the table below. Per those expectations, unpaired surrogates
// contribute no output bytes, while a high+low surrogate pair becomes one 4-byte UTF-8 sequence.
//
// The whole table is run twice: first with a buffer of exactly the reported size, then with a
// buffer that is one byte too small, in which case nothing is expected to be copied.
void test_rocky_text_encoding__jerry_cesu8_to_utf8_conversion(void) {
  struct {
    const char *const script;
    size_t expected_utf_size;
    // Left unset (NULL) for cases that expect no output; only read when expected_utf_size != 0.
    const char *const expected_utf_data;
  } cases[] = {
    [0] = {
      .script = "var str = '\\uDCA9';", // low surrogate only
      .expected_utf_size = 0,
    },
    [1] = {
      .script = "var str = '\\uD83D';", // high surrogate only
      .expected_utf_size = 0,
    },
    [2] = {
      .script = "var str = '\\uDCA9\\uD83D';", // reversed order
      .expected_utf_size = 0,
    },
    [3] = {
      .script = "var str = '\\uD83Dx\\uDCA9';", // non-surrogate in between pair
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [4] = {
      .script = "var str = '\\uD83Dx';", // high surrogate followed by non-surrogate
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [5] = {
      .script = "var str = '\\uDCA9x';", // low surrogate followed by non-surrogate
      .expected_utf_size = 1,
      .expected_utf_data = "x",
    },
    [6] = {
      .script = "var str = 'AB';",
      .expected_utf_size = 2,
      .expected_utf_data = "AB",
    },
    [7] = {
      .script = "var str = '\xC4\x91';", // 2-byte codepoint (U+0111)
      .expected_utf_size = 2,
      .expected_utf_data = "\xC4\x91",
    },
    [8] = {
      .script = "var str = '\xE0\xA0\x95';", // 3-byte codepoint (U+0815)
      .expected_utf_size = 3,
      .expected_utf_data = "\xE0\xA0\x95",
    },
    [9] = {
      .script = "var str = '\\uD83D\\uDCA9';", // 4-byte codepoint (U+1F4A9, escaped data)
      .expected_utf_size = 4,
      .expected_utf_data = "\xF0\x9F\x92\xA9",
    },
    [10] = {
      .script = "var str = '\xF0\x9F\x92\xA9';", // 4-byte codepoint (U+1F4A9, UTF-8 data in source)
      .expected_utf_size = 4,
      .expected_utf_data = "\xF0\x9F\x92\xA9",
    },
  };

  for (int pass = 0; pass < 2; ++pass) {
    const bool is_overflow_test = (pass == 1);
    for (size_t i = 0; i < ARRAY_LENGTH(cases); ++i) {
      printf("Case %zu (is_overflow_test=%d): %s\n", i, is_overflow_test, cases[i].script);

      EXECUTE_SCRIPT(cases[i].script);
      const jerry_value_t s = JS_GLOBAL_GET_VALUE("str");

      // The reported size must match the number of UTF-8 bytes expected for this case:
      const jerry_size_t utf8_size = jerry_get_utf8_string_size(s);
      cl_assert_equal_i(utf8_size, cases[i].expected_utf_size);

      // In the overflow pass, offer one byte less than required. An empty result cannot be
      // undersized, so its buffer is 0 bytes in both passes.
      const size_t buffer_size =
          (is_overflow_test && utf8_size > 0) ? (utf8_size - 1) : utf8_size;

      // malloc, so DUMA will detect buffer overflows:
      jerry_char_t *utf8_buffer = malloc(buffer_size);
      // malloc(0) is allowed to return NULL, so only insist on a buffer when bytes are needed.
      // Without this check an allocation failure would surface as a crash inside the API under
      // test rather than as a test failure.
      if (buffer_size > 0) {
        cl_assert_equal_i(utf8_buffer != NULL, true);
      }

      const jerry_size_t copied_size =
          jerry_string_to_utf8_char_buffer(s, utf8_buffer, buffer_size);
      if (!is_overflow_test) {
        cl_assert_equal_i(copied_size, cases[i].expected_utf_size);
        if (cases[i].expected_utf_size) {
          cl_assert_equal_m(utf8_buffer, cases[i].expected_utf_data, cases[i].expected_utf_size);
        }
      } else {
        // When buffer is too small, expect 0 bytes copied:
        cl_assert_equal_i(copied_size, 0);
      }
      jerry_release_value(s);

      free(utf8_buffer);
    }
  }
}

// Builds a JerryScript string from UTF-8 input with jerry_create_string_utf8() and checks the
// bytes that jerry_string_to_char_buffer() copies back out against the expected CESU-8 form.
void test_rocky_text_encoding__jerry_utf8_to_cesu8_conversion(void) {
  struct {
    const char *const utf8_input;
    const char *const cesu8_output;
  } cases[] = {
    // Empty and plain ASCII input come back unchanged:
    { .utf8_input = "", .cesu8_output = "" },
    { .utf8_input = "abc", .cesu8_output = "abc" },
    // U+1F4A9 expands to surrogate pair:
    { .utf8_input = "abc\xF0\x9F\x92\xA9xyz", .cesu8_output = "abc\xed\xa0\xbd\xed\xb2\xa9xyz" },
    // Be lax with surrogates: even though they're not supposed to appear in UTF-8,
    // just copy them over to the CESU-8 output, even a "half pair":
    { .utf8_input = "\xed\xa0\xbd", .cesu8_output = "\xed\xa0\xbd" },
  };

  for (size_t i = 0; i < ARRAY_LENGTH(cases); ++i) {
    // Zero-filled scratch buffer, comfortably larger than any expected output in the table:
    jerry_char_t output[32] = {0};

    const jerry_value_t js_string =
        jerry_create_string_utf8((const jerry_char_t *)cases[i].utf8_input);
    const jerry_size_t copied_bytes =
        jerry_string_to_char_buffer(js_string, output, sizeof(output));

    // The byte count must match first; the content is only compared when there is any:
    cl_assert_equal_i(copied_bytes, strlen(cases[i].cesu8_output));
    if (copied_bytes) {
      cl_assert_equal_m(output, cases[i].cesu8_output, copied_bytes);
    }

    // TODO: test equality/hash
    jerry_release_value(js_string);
  }
}