pebble/third_party/utf8/utf8.h

#pragma once

#include "applib/fonts/codepoint.h"
#include "util/iterator.h"

#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>

typedef uint8_t utf8_t;

#define UTF8_ELLIPSIS_STRING ("\xe2\x80\xa6")

////////////////////////////////////////////////////////////
// UTF-8 Internal API

//! Validate a UTF-8 encoded c-string.
//! @param string A null-terminated UTF-8 c-string.
//! @return True if the string is valid UTF-8, false otherwise
bool utf8_is_valid_string(const char *string);

//! Move past the current codepoint to the start of the next codepoint.
//! @param start A null-terminated UTF-8 c-string.
//! @return pointer to the next codepoint if one can be found, NULL otherwise
utf8_t *utf8_get_next(utf8_t *start);

//! Move before the current codepoint to the start of the previous codepoint.
//! @param start The start of the utf-8 string.
//! @param current The current utf-8 codepoint in the string
//! @note: we assume utf8_get_next was used previously and thus the utf8 is well formed
utf8_t *utf8_get_previous(utf8_t *start, utf8_t *current);

//! Peek at the string and return the next codepoint
//! @return next codepoint if one can be found, GRAPHICS_INVALID_STREAM otherwise
uint32_t utf8_peek_codepoint(utf8_t *string, utf8_t **next_ptr);

//! Copies the UTF-8 character at origin to dest, given there is a valid character and it fits.
//! Does nothing and returns zero if not.
//! @param dest Pointer to the buffer to copy a character into.
//! @param origin Pointer to a utf-8 character to copy.
//! @param length Maximum number of bytes to copy.
//! @return The number of bytes copied.
size_t utf8_copy_character(utf8_t *dest, utf8_t *origin, size_t length);

//! Returns the length of the string if this length is less than \ref max_size bytes. Otherwise, it
//! returns the length of the string up until the end of the last valid codepoint that fits into
//! \ref max_size bytes and \ref truncated is set to true (it is set to false if the string is not
//! truncated)
//! @param text A null-terminated UTF-8 c-string.
//! @param max_size maximum allowable size, in bytes, of the string (including null terminator)
//! @return length of string in bytes (will always be less than \ref max_size)
size_t utf8_get_size_truncate(const char *text, size_t max_size);

//! Truncates \ref in_string to at most \ref max_length bytes (including the null
//! terminator) with ellipsis.
//! @param in_string A null-terminated UTF-8 c-string.
//! @param[out] out_buffer A buffer where the truncated string will be output,
//!                        must have length at least max_length.
//! @param max_length Max allowable size bytes of the output string (including null terminator).
//! @return Length of output string in bytes (always less than or equal to max_length).
size_t utf8_truncate_with_ellipsis(const char *in_string, char *out_buffer, size_t max_length);

////////////////////////////////////////////////////////////
// UTF-8 Iterator API

typedef struct {
  utf8_t *start;
  utf8_t *end; //<! Points to first un-decodable codepoint
} Utf8Bounds;

typedef struct {
  Utf8Bounds const  *bounds;
  utf8_t *current; //<! Must be within bounds, inclusive; advancing past trips assert
  utf8_t *next;
  uint32_t codepoint; //! Cached current codepoint
} Utf8IterState;

Utf8Bounds utf8_get_bounds(bool *const success, char const *text);

void utf8_iter_init(Iterator *utf8_iter, Utf8IterState *utf8_iter_state, Utf8Bounds const  *bounds, utf8_t *start);

bool utf8_iter_next(IteratorState state);

bool utf8_iter_prev(IteratorState state);

//! A Codepoint callback will be called for each codepoint
//! @param index int of the current codepoint index
//! @param codepoint the current Codepoint of the iteration
//! @param context user context that is passed for each iteration
//! @return true to continue the iterator, otherwise false to break the iteration
typedef bool (*Utf8EachCodepoint)(int index, Codepoint codepoint, void *context);

//! Calls a user given Utf8EachCodepoint callback for each codepoint given a valid UTF-8 c-string
//! @param str a null-terminated UTF-8 c-string
//! @param callback Utf8EachCodepoint callback
//! @param context user context to be passed to the callback
//! @return true if the string was a valid UTF-8 c-string, false otherwise
bool utf8_each_codepoint(const char *str, Utf8EachCodepoint callback, void *context);