mirror of
https://github.com/google/pebble.git
synced 2025-03-15 16:51:21 +00:00
343 lines
9.9 KiB
C
343 lines
9.9 KiB
C
#include "utf8.h"
|
|
|
|
#include "system/passert.h"
|
|
#include "system/logging.h"
|
|
|
|
#include "util/iterator.h"
|
|
#include "util/math.h"
|
|
#include "util/size.h"
|
|
#include "util/string.h"
|
|
|
|
#include <inttypes.h>
|
|
#include <stdbool.h>
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//! Decoder state meaning "no sequence is in progress": the state a decoder starts in and the
//! state it returns to once a complete, well-formed sequence has been consumed.
static const unsigned int VALID_UTF8 = 0;

//! Lookup table driving utf8_decode(): 256 character-class entries followed by 108 state
//! transition entries (9 states x 12 classes).
static const uint8_t utf8d[] = {
  // Part 1: maps every possible byte value to a character class. Working on classes instead of
  // raw bytes keeps the transition table small, and utf8_decode() also uses the class of a
  // leading byte as the shift count of the mask (0xff >> class) that extracts its payload bits.
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,  // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  // c0..df
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,  // e0..ff

  // Part 2: the transition table. Entry [256 + state + class] is the next state. States are
  // stored pre-multiplied by 12 (the number of classes), so each group of 12 entries below is
  // one state. State 12 maps every class back to 12, so once entered it is never left.
  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,  // states 0, 12
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,  // states 24, 36
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,  // states 48, 60
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,  // states 72, 84
  12,36,12,12,12,12,12,12,12,12,12,12,  // state 96
};
|
|
|
|
static uint32_t utf8_decode(uint8_t *state, uint32_t *codepoint, uint32_t byte) {
|
|
uint32_t type = utf8d[byte];
|
|
|
|
*codepoint = (*state != VALID_UTF8) ?
|
|
(byte & 0x3fu) | (*codepoint << 6) :
|
|
(0xff >> type) & (byte);
|
|
|
|
*state = utf8d[256 + *state + type];
|
|
return *state;
|
|
}
|
|
|
|
//! Print all code points in a c-string (debugging)
|
|
//! @param s A null-terminated c-string
|
|
void utf8_print_code_points(utf8_t *s) {
|
|
uint32_t codepoint;
|
|
uint8_t state = 0;
|
|
|
|
for (; *s; ++s) {
|
|
if (!utf8_decode(&state, &codepoint, *s)) {
|
|
PBL_LOG(LOG_LEVEL_ALWAYS, "U+%04"PRIX32, codepoint);
|
|
}
|
|
}
|
|
|
|
if (state != VALID_UTF8) {
|
|
PBL_LOG(LOG_LEVEL_ALWAYS, "String is not well-formed");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////
|
|
// Private API
|
|
|
|
//! Peek at the string and return the next codepoint
|
|
uint32_t utf8_peek_codepoint(utf8_t *stream, utf8_t **next_ptr) {
|
|
uint32_t codepoint = 0;
|
|
uint8_t state = 0;
|
|
|
|
if (stream == NULL) {
|
|
return 0;
|
|
}
|
|
|
|
for (; *stream; stream++) {
|
|
if (utf8_decode(&state, &codepoint, *stream)) {
|
|
// not done, loop again
|
|
continue;
|
|
}
|
|
if (next_ptr) {
|
|
*next_ptr = ++stream;
|
|
}
|
|
return codepoint;
|
|
}
|
|
|
|
if (next_ptr) {
|
|
*next_ptr = NULL;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
//! Find the start of the codepoint that follows the one beginning at stream.
//! @param stream Null-terminated UTF-8 text; may be NULL
//! @return Pointer to the byte after the first complete codepoint, or NULL if stream is NULL or
//!   the terminator is reached before a codepoint completes (this includes the empty string)
utf8_t *utf8_get_next(utf8_t *stream) {
  if (stream == NULL) {
    return stream;
  }

  uint32_t scratch = 0;
  uint8_t decoder_state = 0;

  while (*stream) {
    const uint32_t pending = utf8_decode(&decoder_state, &scratch, *stream);
    ++stream;
    if (pending == VALID_UTF8) {
      // A whole codepoint was consumed; stream now sits on the start of the next one
      return stream;
    }
  }

  // Ran out of bytes without completing a codepoint
  return NULL;
}
|
|
|
|
// see http://stackoverflow.com/questions/22257486/iterate-backwards-through-a-utf8-multibyte-string
|
|
utf8_t *utf8_get_previous(utf8_t *start, utf8_t *stream) {
|
|
do {
|
|
if (stream <= start) {
|
|
return NULL;
|
|
}
|
|
--stream;
|
|
} while ((*stream & 0xc0) == 0x80);
|
|
|
|
return stream;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////
|
|
// Public API
|
|
|
|
//! Return NULL if not successful in decoding text
|
|
utf8_t *utf8_get_end(const char *text) {
|
|
if (text == NULL) {
|
|
return (utf8_t *) text;
|
|
}
|
|
|
|
uint8_t *stream = (uint8_t *) text;
|
|
uint32_t codepoint = 0;
|
|
uint8_t state = 0;
|
|
|
|
while (*stream) {
|
|
utf8_decode(&state, &codepoint, *stream);
|
|
stream++;
|
|
}
|
|
|
|
bool success = (state == VALID_UTF8);
|
|
if (!success) {
|
|
return NULL;
|
|
}
|
|
|
|
return (utf8_t *) stream;
|
|
}
|
|
|
|
|
|
bool utf8_is_valid_string(const char *char_stream) {
|
|
return (utf8_get_end(char_stream) != NULL);
|
|
}
|
|
|
|
Utf8Bounds utf8_get_bounds(bool *const success, char const *text) {
|
|
Utf8Bounds bounds;
|
|
bounds.start = (utf8_t *) text;
|
|
bounds.end = bounds.start;
|
|
|
|
utf8_t *end = utf8_get_end(text);
|
|
|
|
if (NULL == end) {
|
|
*success = false;
|
|
return bounds;
|
|
}
|
|
|
|
bounds.end = end;
|
|
*success = true;
|
|
return bounds;
|
|
}
|
|
|
|
//! Fill in bounds for a string.
//! @param bounds Bounds to initialize; start is always set to text
//! @param text Null-terminated string
//! @return true if the string decoded cleanly, in which case end points at the terminator;
//!   false otherwise, in which case end equals start
bool utf8_bounds_init(Utf8Bounds *bounds, const char *text) {
  utf8_t *const first = (utf8_t *) text;
  bounds->start = first;

  utf8_t *const last = utf8_get_end(text);
  const bool well_formed = (last != NULL);

  // An undecodable string yields empty bounds
  bounds->end = well_formed ? last : first;
  return well_formed;
}
|
|
|
|
//! Iterator callback: advance to the next codepoint.
//! @param state The Utf8IterState of the iterator
//! @return true if the iterator moved onto a codepoint (now cached in the state), false if it
//!   was already at or past the end of its bounds, or if there is nothing further to decode
bool utf8_iter_next(IteratorState state) {
  Utf8IterState *iter = (Utf8IterState *) state;
  PBL_ASSERTN(iter);

  // The cached codepoint is stale no matter how this call ends
  iter->codepoint = 0;

  if (iter->current >= iter->bounds->end) {
    return false;
  }

  utf8_t *const upcoming = iter->next;
  iter->current = upcoming;

  // A NULL next pointer or a terminator both mean there is nothing left to decode
  if (upcoming == NULL || *upcoming == '\0') {
    return false;
  }

  iter->codepoint = utf8_peek_codepoint(upcoming, &iter->next);
  return true;
}
|
|
|
|
//! Iterator callback: step back to the previous codepoint.
//! @param state The Utf8IterState of the iterator
//! @return false if the iterator is already at (or before) the start of its bounds, true
//!   otherwise, in which case the state's current, codepoint and next fields are refreshed
bool utf8_iter_prev(IteratorState state) {
  Utf8IterState *iter = (Utf8IterState *) state;
  PBL_ASSERTN(iter);

  // The cached codepoint is stale no matter how this call ends
  iter->codepoint = 0;

  utf8_t *const lower_bound = iter->bounds->start;
  if (iter->current <= lower_bound) {
    return false;
  }

  iter->current = utf8_get_previous(lower_bound, iter->current);
  iter->codepoint = utf8_peek_codepoint(iter->current, &iter->next);
  return true;
}
|
|
|
|
//! Set up an iterator that walks the codepoints inside bounds.
//! @param utf8_iter Iterator to initialize
//! @param utf8_iter_state State storage for the iterator; must not be NULL
//! @param bounds Bounds of the string being iterated; must not be NULL
//! @param start Initial position; must satisfy bounds->start <= start <= bounds->end
void utf8_iter_init(Iterator *utf8_iter, Utf8IterState *utf8_iter_state, Utf8Bounds const *bounds, utf8_t *start) {
  Utf8IterState *const iter = utf8_iter_state;

  PBL_ASSERTN(iter);
  PBL_ASSERTN(bounds);
  iter->bounds = bounds;

  // The starting position has to lie within the bounds
  PBL_ASSERTN(start >= bounds->start);
  PBL_ASSERTN(start <= bounds->end);
  iter->current = start;

  // Prime the state with the codepoint at the starting position
  iter->codepoint = utf8_peek_codepoint(start, &iter->next);

  iter_init(utf8_iter, (IteratorCallback) utf8_iter_next, utf8_iter_prev, (IteratorState) utf8_iter_state);
}
|
|
|
|
//! Copy the single character that starts at origin into dest.
//! @param dest Destination buffer
//! @param origin Start of the character to copy
//! @param length Number of bytes available in dest
//! @return Number of bytes copied, or 0 if the character could not be delimited or does not
//!   fit in length bytes. No terminator is written.
size_t utf8_copy_character(utf8_t *dest, utf8_t *origin, size_t length) {
  utf8_t *char_end = utf8_get_next(origin);
  if (!char_end) {
    // utf8_get_next() found no complete codepoint at origin, so fall back to the end of the
    // string, which utf8_get_end() only reports if the text decodes cleanly.
    char_end = utf8_get_end((char *)origin);
  }
  if (!char_end) {
    return 0;
  }

  const size_t char_size = char_end - origin;
  if (char_size > length) {
    // A partial character is never copied
    return 0;
  }

  memcpy(dest, origin, char_size);
  return char_size;
}
|
|
|
|
size_t utf8_get_size_truncate(const char *text, size_t max_size) {
|
|
PBL_ASSERTN(text);
|
|
PBL_ASSERTN(max_size > 0);
|
|
|
|
size_t len = strnlen(text, max_size);
|
|
if (len == 0) {
|
|
return len;
|
|
}
|
|
|
|
// get the start of the previous character if the string is too long
|
|
if (max_size == len) {
|
|
// src[len] is be valid because strnlen indicated that the source string is at least len
|
|
// characters, therefore len can, at worst, only be the end of the string
|
|
utf8_t *end = utf8_get_previous((utf8_t *)text, (utf8_t *)&text[len]);
|
|
len = end - (utf8_t *)text;
|
|
}
|
|
|
|
return len;
|
|
}
|
|
|
|
size_t utf8_truncate_with_ellipsis(const char *in_string, char *out_buffer, size_t max_length) {
|
|
const char ellipsis[] = UTF8_ELLIPSIS_STRING;
|
|
const size_t ellipsis_length = ARRAY_LENGTH(ellipsis);
|
|
if (max_length < ellipsis_length) {
|
|
return 0;
|
|
}
|
|
const size_t in_length_bytes = strlen(in_string) + 1;
|
|
const size_t clamped_in_length_bytes = MIN(in_length_bytes, max_length - (ellipsis_length - 1));
|
|
if (in_length_bytes > max_length) {
|
|
// finds where the ellipsis should start, by asking utf8_get_size_truncate
|
|
const size_t ellipsis_start_offset = utf8_get_size_truncate(in_string, clamped_in_length_bytes);
|
|
strncpy(out_buffer, in_string, ellipsis_start_offset);
|
|
strncpy(&out_buffer[ellipsis_start_offset], ellipsis, ellipsis_length);
|
|
return ellipsis_start_offset + ellipsis_length;
|
|
} else {
|
|
strncpy(out_buffer, in_string, in_length_bytes);
|
|
}
|
|
return in_length_bytes;
|
|
}
|
|
|
|
//! Invoke callback for each codepoint of str, in order.
//! Iteration stops at the first zero codepoint, when callback returns false, or when the
//! iterator cannot advance.
//! @param str Null-terminated string
//! @param callback Called with the zero-based index of the codepoint, the codepoint and context
//! @param context Opaque pointer handed through to callback
//! @return false if the bounds of str could not be determined; true otherwise, regardless of
//!   why the iteration stopped
bool utf8_each_codepoint(const char *str, Utf8EachCodepoint callback, void *context) {
  bool well_formed = false;
  const Utf8Bounds utf8_bounds = utf8_get_bounds(&well_formed, str);
  if (!well_formed) {
    return false;
  }

  Iterator utf8_iter;
  Utf8IterState utf8_iter_state;
  utf8_iter_init(&utf8_iter, &utf8_iter_state, &utf8_bounds, utf8_bounds.start);

  for (int index = 0; utf8_iter_state.codepoint; ++index) {
    if (!callback(index, utf8_iter_state.codepoint, context)) {
      break;
    }
    if (!iter_next(&utf8_iter)) {
      break;
    }
  }
  return true;
}
|