mirror of
https://github.com/google/pebble.git
synced 2025-07-04 22:00:38 -04:00
Import of the watch repository from Pebble
This commit is contained in:
commit
3b92768480
10334 changed files with 2564465 additions and 0 deletions
445
third_party/jerryscript/jerry-core/parser/regexp/re-bytecode.c
vendored
Normal file
445
third_party/jerryscript/jerry-core/parser/regexp/re-bytecode.c
vendored
Normal file
|
@ -0,0 +1,445 @@
|
|||
/* Copyright 2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecma-globals.h"
|
||||
#include "re-bytecode.h"
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_bytecode Bytecode
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Size of block of RegExp bytecode. Used for allocation
|
||||
*/
|
||||
/* The container grows by exactly this many bytes per reallocation, and the append/insert
 * routines assert that a single write never exceeds it. */
#define REGEXP_BYTECODE_BLOCK_SIZE 256UL
|
||||
|
||||
/**
|
||||
* Realloc the bytecode container
|
||||
*
|
||||
* @return current position in RegExp bytecode
|
||||
*/
|
||||
static uint8_t *
|
||||
re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
|
||||
{
|
||||
JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p);
|
||||
size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
|
||||
|
||||
/* If one of the members of RegExp bytecode context is NULL, then all member should be NULL
|
||||
* (it means first allocation), otherwise all of the members should be a non NULL pointer. */
|
||||
JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p)
|
||||
|| (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p));
|
||||
|
||||
size_t new_block_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE;
|
||||
JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p);
|
||||
size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p);
|
||||
|
||||
uint8_t *new_block_start_p = (uint8_t *) jmem_heap_alloc_block (new_block_size);
|
||||
if (bc_ctx_p->current_p)
|
||||
{
|
||||
memcpy (new_block_start_p, bc_ctx_p->block_start_p, (size_t) (current_ptr_offset));
|
||||
jmem_heap_free_block (bc_ctx_p->block_start_p, old_size);
|
||||
}
|
||||
bc_ctx_p->block_start_p = new_block_start_p;
|
||||
bc_ctx_p->block_end_p = new_block_start_p + new_block_size;
|
||||
bc_ctx_p->current_p = new_block_start_p + current_ptr_offset;
|
||||
|
||||
return bc_ctx_p->current_p;
|
||||
} /* re_realloc_regexp_bytecode_block */
|
||||
|
||||
/**
|
||||
* Append a new bytecode to the and of the bytecode container
|
||||
*/
|
||||
static void
|
||||
re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint8_t *bytecode_p, /**< input bytecode */
|
||||
size_t length) /**< length of input */
|
||||
{
|
||||
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
|
||||
|
||||
uint8_t *current_p = bc_ctx_p->current_p;
|
||||
if (current_p + length > bc_ctx_p->block_end_p)
|
||||
{
|
||||
current_p = re_realloc_regexp_bytecode_block (bc_ctx_p);
|
||||
}
|
||||
|
||||
memcpy (current_p, bytecode_p, length);
|
||||
bc_ctx_p->current_p += length;
|
||||
} /* re_bytecode_list_append */
|
||||
|
||||
/**
|
||||
* Insert a new bytecode to the bytecode container
|
||||
*/
|
||||
void
|
||||
re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
size_t offset, /**< distance from the start of the container */
|
||||
uint8_t *bytecode_p, /**< input bytecode */
|
||||
size_t length) /**< length of input */
|
||||
{
|
||||
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
|
||||
|
||||
uint8_t *current_p = bc_ctx_p->current_p;
|
||||
if (current_p + length > bc_ctx_p->block_end_p)
|
||||
{
|
||||
re_realloc_regexp_bytecode_block (bc_ctx_p);
|
||||
}
|
||||
|
||||
uint8_t *src_p = bc_ctx_p->block_start_p + offset;
|
||||
if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0)
|
||||
{
|
||||
uint8_t *dest_p = src_p + length;
|
||||
uint8_t *tmp_block_start_p;
|
||||
tmp_block_start_p = (uint8_t *) jmem_heap_alloc_block (re_get_bytecode_length (bc_ctx_p) - offset);
|
||||
memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
|
||||
memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
|
||||
jmem_heap_free_block (tmp_block_start_p, re_get_bytecode_length (bc_ctx_p) - offset);
|
||||
}
|
||||
memcpy (src_p, bytecode_p, length);
|
||||
|
||||
bc_ctx_p->current_p += length;
|
||||
} /* re_bytecode_list_insert */
|
||||
|
||||
/**
|
||||
* Get a character from the RegExp bytecode and increase the bytecode position
|
||||
*
|
||||
* @return ecma character
|
||||
*/
|
||||
ecma_char_t __attr_always_inline___
|
||||
re_get_char (uint8_t **bc_p) /**< pointer to bytecode start */
|
||||
{
|
||||
ecma_char_t chr = *((ecma_char_t *) *bc_p);
|
||||
(*bc_p) += sizeof (ecma_char_t);
|
||||
return chr;
|
||||
} /* re_get_char */
|
||||
|
||||
/**
|
||||
* Get a RegExp opcode and increase the bytecode position
|
||||
*
|
||||
* @return current RegExp opcode
|
||||
*/
|
||||
re_opcode_t __attr_always_inline___
|
||||
re_get_opcode (uint8_t **bc_p) /**< pointer to bytecode start */
|
||||
{
|
||||
uint8_t bytecode = **bc_p;
|
||||
(*bc_p) += sizeof (uint8_t);
|
||||
return (re_opcode_t) bytecode;
|
||||
} /* re_get_opcode */
|
||||
|
||||
/**
|
||||
* Get a parameter of a RegExp opcode and increase the bytecode position
|
||||
*
|
||||
* @return opcode parameter
|
||||
*/
|
||||
uint32_t __attr_always_inline___
|
||||
re_get_value (uint8_t **bc_p) /**< pointer to bytecode start */
|
||||
{
|
||||
uint32_t value = *((uint32_t *) *bc_p);
|
||||
(*bc_p) += sizeof (uint32_t);
|
||||
return value;
|
||||
} /* re_get_value */
|
||||
|
||||
/**
|
||||
* Get length of bytecode
|
||||
*
|
||||
* @return bytecode length (unsigned integer)
|
||||
*/
|
||||
uint32_t __attr_pure___ __attr_always_inline___
|
||||
re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
|
||||
{
|
||||
return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p));
|
||||
} /* re_get_bytecode_length */
|
||||
|
||||
/**
|
||||
* Append a RegExp opcode
|
||||
*/
|
||||
void
|
||||
re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
re_opcode_t opcode) /**< input opcode */
|
||||
{
|
||||
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &opcode, sizeof (uint8_t));
|
||||
} /* re_append_opcode */
|
||||
|
||||
/**
|
||||
* Append a parameter of a RegExp opcode
|
||||
*/
|
||||
void
|
||||
re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &value, sizeof (uint32_t));
|
||||
} /* re_append_u32 */
|
||||
|
||||
/**
|
||||
* Append a character to the RegExp bytecode
|
||||
*/
|
||||
void
|
||||
re_append_char (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
ecma_char_t input_char) /**< input char */
|
||||
{
|
||||
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &input_char, sizeof (ecma_char_t));
|
||||
} /* re_append_char */
|
||||
|
||||
/**
|
||||
* Append a jump offset parameter of a RegExp opcode
|
||||
*/
|
||||
void
|
||||
re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
value += (uint32_t) (sizeof (uint32_t));
|
||||
re_append_u32 (bc_ctx_p, value);
|
||||
} /* re_append_jump_offset */
|
||||
|
||||
/**
|
||||
* Insert a RegExp opcode
|
||||
*/
|
||||
void
|
||||
re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t offset, /**< distance from the start of the container */
|
||||
re_opcode_t opcode) /**< input opcode */
|
||||
{
|
||||
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &opcode, sizeof (uint8_t));
|
||||
} /* re_insert_opcode */
|
||||
|
||||
/**
|
||||
* Insert a parameter of a RegExp opcode
|
||||
*/
|
||||
void
|
||||
re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t offset, /**< distance from the start of the container */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &value, sizeof (uint32_t));
|
||||
} /* re_insert_u32 */
|
||||
|
||||
#ifdef REGEXP_DUMP_BYTE_CODE
|
||||
/**
|
||||
* RegExp bytecode dumper
|
||||
*/
|
||||
void
|
||||
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
|
||||
{
|
||||
re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) bc_ctx_p->block_start_p;
|
||||
JERRY_DEBUG_MSG ("%d ", compiled_code_p->header.status_flags);
|
||||
JERRY_DEBUG_MSG ("%d ", compiled_code_p->num_of_captures);
|
||||
JERRY_DEBUG_MSG ("%d | ", compiled_code_p->num_of_non_captures);
|
||||
|
||||
uint8_t *bytecode_p = (uint8_t *) (compiled_code_p + 1);
|
||||
|
||||
re_opcode_t op;
|
||||
while ((op = re_get_opcode (&bytecode_p)))
|
||||
{
|
||||
switch (op)
|
||||
{
|
||||
case RE_OP_MATCH:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("MATCH, ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_CHAR:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("CHAR ");
|
||||
JERRY_DEBUG_MSG ("%c, ", (char) re_get_char (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("N");
|
||||
/* FALLTHRU */
|
||||
}
|
||||
case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("GZ_START ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_CAPTURE_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("START ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("N");
|
||||
/* FALLTHRU */
|
||||
}
|
||||
case RE_OP_CAPTURE_GREEDY_GROUP_END:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("G_END ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("N");
|
||||
/* FALLTHRU */
|
||||
}
|
||||
case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("GZ_NC_START ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_NON_CAPTURE_GROUP_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("NC_START ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("N");
|
||||
/* FALLTHRU */
|
||||
}
|
||||
case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("G_NC_END ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_SAVE_AT_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("RE_START ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_SAVE_AND_MATCH:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("RE_END, ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_GREEDY_ITERATOR:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("GREEDY_ITERATOR ");
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_NON_GREEDY_ITERATOR:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("NON_GREEDY_ITERATOR ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_PERIOD:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("PERIOD ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_ALTERNATIVE:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("ALTERNATIVE ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_ASSERT_START:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("ASSERT_START ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_ASSERT_END:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("ASSERT_END ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_ASSERT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("ASSERT_WORD_BOUNDARY ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("ASSERT_NOT_WORD_BOUNDARY ");
|
||||
break;
|
||||
}
|
||||
case RE_OP_LOOKAHEAD_POS:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("LOOKAHEAD_POS ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_LOOKAHEAD_NEG:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("LOOKAHEAD_NEG ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_BACKREFERENCE:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("BACKREFERENCE ");
|
||||
JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
|
||||
break;
|
||||
}
|
||||
case RE_OP_INV_CHAR_CLASS:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("INV_");
|
||||
/* FALLTHRU */
|
||||
}
|
||||
case RE_OP_CHAR_CLASS:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("CHAR_CLASS ");
|
||||
uint32_t num_of_class = re_get_value (&bytecode_p);
|
||||
JERRY_DEBUG_MSG ("%d", num_of_class);
|
||||
while (num_of_class)
|
||||
{
|
||||
JERRY_DEBUG_MSG (" %d", re_get_char (&bytecode_p));
|
||||
JERRY_DEBUG_MSG ("-%d", re_get_char (&bytecode_p));
|
||||
num_of_class--;
|
||||
}
|
||||
JERRY_DEBUG_MSG (", ");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
JERRY_DEBUG_MSG ("UNKNOWN(%d), ", (uint32_t) op);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
JERRY_DEBUG_MSG ("EOF\n");
|
||||
} /* re_dump_bytecode */
|
||||
#endif /* REGEXP_DUMP_BYTE_CODE */
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
129
third_party/jerryscript/jerry-core/parser/regexp/re-bytecode.h
vendored
Normal file
129
third_party/jerryscript/jerry-core/parser/regexp/re-bytecode.h
vendored
Normal file
|
@ -0,0 +1,129 @@
|
|||
/* Copyright 2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef RE_BYTECODE_H
|
||||
#define RE_BYTECODE_H
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
#include "ecma-globals.h"
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_bytecode Bytecode
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Size of the RegExp bytecode cache
|
||||
*/
|
||||
/* NOTE(review): presumably a number of cache entries rather than a byte count - confirm at the use site. */
#define RE_CACHE_SIZE 8u
|
||||
|
||||
/**
|
||||
* RegExp flags mask (first 10 bits are for reference count and the rest for the actual RegExp flags)
|
||||
*/
|
||||
/* 0x3F keeps the six least significant bits.
 * NOTE(review): presumably these are the flag bits and the reference count is stored
 * above them - confirm against the layout of the compiled code status flags. */
#define RE_FLAGS_MASK 0x3F
|
||||
|
||||
/**
 * RegExp opcodes
 *
 * RE_OP_EOF has the value zero; the remaining opcodes are numbered consecutively.
 */
typedef enum
{
  RE_OP_EOF = 0, /**< end of the bytecode */

  /*
   * Group opcodes. Their order is important, because RE_IS_CAPTURE_GROUP is
   * based on it. Change it carefully. The capture opcodes have to come first.
   */
  RE_OP_CAPTURE_GROUP_START, /**< group start */
  RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */
  RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */
  RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */
  RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */
  RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */
  RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */
  RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */
  RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */
  RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */

  RE_OP_MATCH, /**< match */
  RE_OP_CHAR, /**< a single character (the character follows the opcode) */
  RE_OP_SAVE_AT_START, /**< save at start */
  RE_OP_SAVE_AND_MATCH, /**< save and match */
  RE_OP_PERIOD, /**< "." */
  RE_OP_ALTERNATIVE, /**< "|" */
  RE_OP_GREEDY_ITERATOR, /**< greedy iterator */
  RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */
  RE_OP_ASSERT_START, /**< "^" */
  RE_OP_ASSERT_END, /**< "$" */
  RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */
  RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
  RE_OP_LOOKAHEAD_POS, /**< lookahead pos */
  RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */
  RE_OP_BACKREFERENCE, /**< "\[0..9]" */
  RE_OP_CHAR_CLASS, /**< "[ ]" */
  RE_OP_INV_CHAR_CLASS /**< "[^ ]" */
} re_opcode_t;
|
||||
|
||||
/**
|
||||
* Compiled byte code data.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
ecma_compiled_code_t header; /**< compiled code header */
|
||||
jmem_cpointer_t pattern_cp; /**< original RegExp pattern */
|
||||
uint32_t num_of_captures; /**< number of capturing brackets */
|
||||
uint32_t num_of_non_captures; /**< number of non capturing brackets */
|
||||
} re_compiled_code_t;
|
||||
|
||||
/**
 * Context of RegExp bytecode container
 *
 * The three pointers are either all NULL (nothing allocated yet) or all point
 * into the same allocated block.
 */
typedef struct
{
  uint8_t *block_start_p; /**< first byte of the bytecode block */
  uint8_t *block_end_p; /**< one past the last byte of the bytecode block */
  uint8_t *current_p; /**< position where the next bytecode is written */
} re_bytecode_ctx_t;
|
||||
|
||||
re_opcode_t re_get_opcode (uint8_t **);
|
||||
ecma_char_t re_get_char (uint8_t **);
|
||||
uint32_t re_get_value (uint8_t **);
|
||||
uint32_t re_get_bytecode_length (re_bytecode_ctx_t *);
|
||||
|
||||
void re_append_opcode (re_bytecode_ctx_t *, re_opcode_t);
|
||||
void re_append_u32 (re_bytecode_ctx_t *, uint32_t);
|
||||
void re_append_char (re_bytecode_ctx_t *, ecma_char_t);
|
||||
void re_append_jump_offset (re_bytecode_ctx_t *, uint32_t);
|
||||
|
||||
void re_insert_opcode (re_bytecode_ctx_t *, uint32_t, re_opcode_t);
|
||||
void re_insert_u32 (re_bytecode_ctx_t *, uint32_t, uint32_t);
|
||||
void re_bytecode_list_insert (re_bytecode_ctx_t *, size_t, uint8_t *, size_t);
|
||||
|
||||
#ifdef REGEXP_DUMP_BYTE_CODE
|
||||
void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
|
||||
#endif /* REGEXP_DUMP_BYTE_CODE */
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
||||
#endif /* !RE_BYTECODE_H */
|
650
third_party/jerryscript/jerry-core/parser/regexp/re-compiler.c
vendored
Normal file
650
third_party/jerryscript/jerry-core/parser/regexp/re-compiler.c
vendored
Normal file
|
@ -0,0 +1,650 @@
|
|||
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015-2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecma-exceptions.h"
|
||||
#include "ecma-helpers.h"
|
||||
#include "ecma-regexp-object.h"
|
||||
#include "ecma-try-catch-macro.h"
|
||||
#include "jcontext.h"
|
||||
#include "jrt-libc-includes.h"
|
||||
#include "jmem-heap.h"
|
||||
#include "re-bytecode.h"
|
||||
#include "re-compiler.h"
|
||||
#include "re-parser.h"
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_compiler Compiler
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Callback function of character class generation
|
||||
*/
|
||||
static void
|
||||
re_append_char_class (void *re_ctx_p, /**< RegExp compiler context */
|
||||
ecma_char_t start, /**< character class range from */
|
||||
ecma_char_t end) /**< character class range to */
|
||||
{
|
||||
re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t *) re_ctx_p;
|
||||
re_append_char (ctx_p->bytecode_ctx_p, start);
|
||||
re_append_char (ctx_p->bytecode_ctx_p, end);
|
||||
ctx_p->parser_ctx_p->num_of_classes++;
|
||||
} /* re_append_char_class */
|
||||
|
||||
/**
|
||||
* Insert simple atom iterator
|
||||
*/
|
||||
static void
|
||||
re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t new_atom_start_offset) /**< atom start offset */
|
||||
{
|
||||
uint32_t atom_code_length;
|
||||
uint32_t offset;
|
||||
uint32_t qmin, qmax;
|
||||
|
||||
qmin = re_ctx_p->current_token.qmin;
|
||||
qmax = re_ctx_p->current_token.qmax;
|
||||
JERRY_ASSERT (qmin <= qmax);
|
||||
|
||||
/* TODO: optimize bytecode length. Store 0 rather than INF */
|
||||
|
||||
re_append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */
|
||||
uint32_t bytecode_length = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset);
|
||||
|
||||
offset = new_atom_start_offset;
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length);
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax);
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin);
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR);
|
||||
}
|
||||
else
|
||||
{
|
||||
re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR);
|
||||
}
|
||||
} /* re_insert_simple_iterator */
|
||||
|
||||
/**
|
||||
* Get the type of a group start
|
||||
*
|
||||
* @return RegExp opcode
|
||||
*/
|
||||
static re_opcode_t
|
||||
re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool is_capturable) /**< is capturable group */
|
||||
{
|
||||
if (is_capturable)
|
||||
{
|
||||
if (re_ctx_p->current_token.qmin == 0)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_GROUP_START;
|
||||
}
|
||||
|
||||
if (re_ctx_p->current_token.qmin == 0)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_GROUP_START;
|
||||
} /* re_get_start_opcode_type */
|
||||
|
||||
/**
|
||||
* Get the type of a group end
|
||||
*
|
||||
* @return RegExp opcode
|
||||
*/
|
||||
static re_opcode_t
|
||||
re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool is_capturable) /**< is capturable group */
|
||||
{
|
||||
if (is_capturable)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_CAPTURE_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_NON_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_NON_CAPTURE_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END;
|
||||
} /* re_get_end_opcode_type */
|
||||
|
||||
/**
|
||||
* Enclose the given bytecode to a group
|
||||
*/
|
||||
static void
|
||||
re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t group_start_offset, /**< offset of group start */
|
||||
uint32_t idx, /**< index of group */
|
||||
bool is_capturable) /**< is capturable group */
|
||||
{
|
||||
uint32_t qmin, qmax;
|
||||
re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable);
|
||||
re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable);
|
||||
uint32_t start_head_offset_len;
|
||||
|
||||
qmin = re_ctx_p->current_token.qmin;
|
||||
qmax = re_ctx_p->current_token.qmax;
|
||||
JERRY_ASSERT (qmin <= qmax);
|
||||
|
||||
start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
|
||||
re_insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
|
||||
start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
|
||||
re_append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
|
||||
re_append_u32 (re_ctx_p->bytecode_ctx_p, idx);
|
||||
re_append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
|
||||
re_append_u32 (re_ctx_p->bytecode_ctx_p, qmax);
|
||||
|
||||
group_start_offset += start_head_offset_len;
|
||||
re_append_jump_offset (re_ctx_p->bytecode_ctx_p,
|
||||
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
|
||||
if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START)
|
||||
{
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p,
|
||||
group_start_offset,
|
||||
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
}
|
||||
} /* re_insert_into_group */
|
||||
|
||||
/**
|
||||
* Enclose the given bytecode to a group and inster jump value
|
||||
*/
|
||||
static void
|
||||
re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t group_start_offset, /**< offset of group start */
|
||||
uint32_t idx, /**< index of group */
|
||||
bool is_capturable) /**< is capturable group */
|
||||
{
|
||||
re_insert_u32 (re_ctx_p->bytecode_ctx_p,
|
||||
group_start_offset,
|
||||
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
|
||||
} /* re_insert_into_group_with_jump */
|
||||
|
||||
/**
|
||||
* Parse alternatives
|
||||
*
|
||||
* @return empty ecma value - if alternative was successfully parsed
|
||||
* error ecma value - otherwise
|
||||
*
|
||||
* Returned value must be freed with ecma_free_value
|
||||
*/
|
||||
static ecma_value_t
|
||||
re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool expect_eof) /**< expect end of file */
|
||||
{
|
||||
uint32_t idx;
|
||||
re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
|
||||
ecma_value_t ret_value = ecma_make_simple_value (ECMA_SIMPLE_VALUE_EMPTY);
|
||||
|
||||
uint32_t alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
bool should_loop = true;
|
||||
|
||||
while (ecma_is_value_empty (ret_value) && should_loop)
|
||||
{
|
||||
ECMA_TRY_CATCH (empty,
|
||||
re_parse_next_token (re_ctx_p->parser_ctx_p,
|
||||
&(re_ctx_p->current_token)),
|
||||
ret_value);
|
||||
|
||||
uint32_t new_atom_start_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
|
||||
switch (re_ctx_p->current_token.type)
|
||||
{
|
||||
case RE_TOK_START_CAPTURE_GROUP:
|
||||
{
|
||||
idx = re_ctx_p->num_of_captures++;
|
||||
JERRY_TRACE_MSG ("Compile a capture group start (idx: %u)\n", (unsigned int) idx);
|
||||
|
||||
ret_value = re_parse_alternative (re_ctx_p, false);
|
||||
|
||||
if (ecma_is_value_empty (ret_value))
|
||||
{
|
||||
re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, true);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RE_TOK_START_NON_CAPTURE_GROUP:
|
||||
{
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
JERRY_TRACE_MSG ("Compile a non-capture group start (idx: %u)\n", (unsigned int) idx);
|
||||
|
||||
ret_value = re_parse_alternative (re_ctx_p, false);
|
||||
|
||||
if (ecma_is_value_empty (ret_value))
|
||||
{
|
||||
re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RE_TOK_CHAR:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile character token: %c, qmin: %u, qmax: %u\n",
|
||||
(char) re_ctx_p->current_token.value, (unsigned int) re_ctx_p->current_token.qmin,
|
||||
(unsigned int) re_ctx_p->current_token.qmax);
|
||||
|
||||
re_append_opcode (bc_ctx_p, RE_OP_CHAR);
|
||||
re_append_char (bc_ctx_p, re_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
|
||||
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_PERIOD:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a period\n");
|
||||
re_append_opcode (bc_ctx_p, RE_OP_PERIOD);
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ALTERNATIVE:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile an alternative\n");
|
||||
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
|
||||
re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
|
||||
alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a start assertion\n");
|
||||
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_END:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile an end assertion\n");
|
||||
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a word boundary assertion\n");
|
||||
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a not word boundary assertion\n");
|
||||
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START_POS_LOOKAHEAD:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a positive lookahead assertion\n");
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);
|
||||
|
||||
ret_value = re_parse_alternative (re_ctx_p, false);
|
||||
|
||||
if (ecma_is_value_empty (ret_value))
|
||||
{
|
||||
re_append_opcode (bc_ctx_p, RE_OP_MATCH);
|
||||
|
||||
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START_NEG_LOOKAHEAD:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a negative lookahead assertion\n");
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);
|
||||
|
||||
ret_value = re_parse_alternative (re_ctx_p, false);
|
||||
|
||||
if (ecma_is_value_empty (ret_value))
|
||||
{
|
||||
re_append_opcode (bc_ctx_p, RE_OP_MATCH);
|
||||
|
||||
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RE_TOK_BACKREFERENCE:
|
||||
{
|
||||
uint32_t backref = (uint32_t) re_ctx_p->current_token.value;
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
|
||||
if (backref > re_ctx_p->highest_backref)
|
||||
{
|
||||
re_ctx_p->highest_backref = backref;
|
||||
}
|
||||
|
||||
JERRY_TRACE_MSG ("Compile a backreference: %u\n", (unsigned int) backref);
|
||||
re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
|
||||
re_append_u32 (bc_ctx_p, backref);
|
||||
|
||||
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_DIGIT:
|
||||
case RE_TOK_NOT_DIGIT:
|
||||
case RE_TOK_WHITE:
|
||||
case RE_TOK_NOT_WHITE:
|
||||
case RE_TOK_WORD_CHAR:
|
||||
case RE_TOK_NOT_WORD_CHAR:
|
||||
case RE_TOK_START_CHAR_CLASS:
|
||||
case RE_TOK_START_INV_CHAR_CLASS:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a character class\n");
|
||||
re_append_opcode (bc_ctx_p,
|
||||
re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS
|
||||
? RE_OP_INV_CHAR_CLASS
|
||||
: RE_OP_CHAR_CLASS);
|
||||
uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
|
||||
|
||||
ECMA_TRY_CATCH (empty,
|
||||
re_parse_char_class (re_ctx_p->parser_ctx_p,
|
||||
re_append_char_class,
|
||||
re_ctx_p,
|
||||
&(re_ctx_p->current_token)),
|
||||
ret_value);
|
||||
re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes);
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
|
||||
ECMA_FINALIZE (empty);
|
||||
|
||||
break;
|
||||
}
|
||||
case RE_TOK_END_GROUP:
|
||||
{
|
||||
JERRY_TRACE_MSG ("Compile a group end\n");
|
||||
|
||||
if (expect_eof)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of paren."));
|
||||
}
|
||||
else
|
||||
{
|
||||
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
|
||||
should_loop = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_EOF:
|
||||
{
|
||||
if (!expect_eof)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern."));
|
||||
}
|
||||
else
|
||||
{
|
||||
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
|
||||
should_loop = false;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected RegExp token."));
|
||||
break;
|
||||
}
|
||||
}
|
||||
ECMA_FINALIZE (empty);
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* re_parse_alternative */
|
||||
|
||||
/**
|
||||
* Search for the given pattern in the RegExp cache
|
||||
*
|
||||
* @return index of bytecode in cache - if found
|
||||
* RE_CACHE_SIZE - otherwise
|
||||
*/
|
||||
static uint8_t
|
||||
re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, /**< pattern string */
|
||||
uint16_t flags) /**< flags */
|
||||
{
|
||||
uint8_t free_idx = RE_CACHE_SIZE;
|
||||
|
||||
for (uint8_t idx = 0u; idx < RE_CACHE_SIZE; idx++)
|
||||
{
|
||||
const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[idx];
|
||||
|
||||
if (cached_bytecode_p != NULL)
|
||||
{
|
||||
ecma_string_t *cached_pattern_str_p;
|
||||
cached_pattern_str_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, cached_bytecode_p->pattern_cp);
|
||||
|
||||
if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags
|
||||
&& ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p))
|
||||
{
|
||||
JERRY_TRACE_MSG ("RegExp is found in cache\n");
|
||||
return idx;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* mark as free, so it can be overridden if the cache is full */
|
||||
free_idx = idx;
|
||||
}
|
||||
}
|
||||
|
||||
JERRY_TRACE_MSG ("RegExp is NOT found in cache\n");
|
||||
return free_idx;
|
||||
} /* re_find_bytecode_in_cache */
|
||||
|
||||
/**
|
||||
* Run gerbage collection in RegExp cache
|
||||
*/
|
||||
void
|
||||
re_cache_gc_run ()
|
||||
{
|
||||
for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++)
|
||||
{
|
||||
const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[i];
|
||||
|
||||
if (cached_bytecode_p != NULL
|
||||
&& cached_bytecode_p->header.refs == 1)
|
||||
{
|
||||
/* Only the cache has reference for the bytecode */
|
||||
ecma_bytecode_deref ((ecma_compiled_code_t *) cached_bytecode_p);
|
||||
JERRY_CONTEXT (re_cache)[i] = NULL;
|
||||
}
|
||||
}
|
||||
} /* re_cache_gc_run */
|
||||
|
||||
/**
|
||||
* Compilation of RegExp bytecode
|
||||
*
|
||||
* @return empty ecma value - if bytecode was compiled successfully
|
||||
* error ecma value - otherwise
|
||||
*
|
||||
* Returned value must be freed with ecma_free_value
|
||||
*/
|
||||
ecma_value_t
|
||||
re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] pointer to bytecode */
|
||||
ecma_string_t *pattern_str_p, /**< pattern */
|
||||
uint16_t flags) /**< flags */
|
||||
{
|
||||
ecma_value_t ret_value = ecma_make_simple_value (ECMA_SIMPLE_VALUE_EMPTY);
|
||||
uint8_t cache_idx = re_find_bytecode_in_cache (pattern_str_p, flags);
|
||||
|
||||
if (cache_idx < RE_CACHE_SIZE)
|
||||
{
|
||||
*out_bytecode_p = JERRY_CONTEXT (re_cache)[cache_idx];
|
||||
|
||||
if (*out_bytecode_p != NULL)
|
||||
{
|
||||
ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p);
|
||||
return ret_value;
|
||||
}
|
||||
}
|
||||
|
||||
/* not in the RegExp cache, so compile it */
|
||||
re_compiler_ctx_t re_ctx;
|
||||
re_ctx.flags = flags;
|
||||
re_ctx.highest_backref = 0;
|
||||
re_ctx.num_of_non_captures = 0;
|
||||
|
||||
re_bytecode_ctx_t bc_ctx;
|
||||
bc_ctx.block_start_p = NULL;
|
||||
bc_ctx.block_end_p = NULL;
|
||||
bc_ctx.current_p = NULL;
|
||||
|
||||
re_ctx.bytecode_ctx_p = &bc_ctx;
|
||||
|
||||
ECMA_STRING_TO_UTF8_STRING (pattern_str_p, pattern_start_p, pattern_start_size);
|
||||
|
||||
re_parser_ctx_t parser_ctx;
|
||||
parser_ctx.input_start_p = pattern_start_p;
|
||||
parser_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p;
|
||||
parser_ctx.input_end_p = pattern_start_p + pattern_start_size;
|
||||
parser_ctx.num_of_groups = -1;
|
||||
re_ctx.parser_ctx_p = &parser_ctx;
|
||||
|
||||
/* 1. Parse RegExp pattern */
|
||||
re_ctx.num_of_captures = 1;
|
||||
re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
|
||||
|
||||
ECMA_TRY_CATCH (empty, re_parse_alternative (&re_ctx, true), ret_value);
|
||||
|
||||
/* 2. Check for invalid backreference */
|
||||
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ("Invalid backreference.\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
|
||||
re_append_opcode (&bc_ctx, RE_OP_EOF);
|
||||
|
||||
/* 3. Insert extra informations for bytecode header */
|
||||
re_compiled_code_t re_compiled_code;
|
||||
|
||||
re_compiled_code.header.refs = 1;
|
||||
re_compiled_code.header.status_flags = re_ctx.flags;
|
||||
ecma_ref_ecma_string (pattern_str_p);
|
||||
ECMA_SET_NON_NULL_POINTER (re_compiled_code.pattern_cp, pattern_str_p);
|
||||
re_compiled_code.num_of_captures = re_ctx.num_of_captures * 2;
|
||||
re_compiled_code.num_of_non_captures = re_ctx.num_of_non_captures;
|
||||
|
||||
re_bytecode_list_insert (&bc_ctx,
|
||||
0,
|
||||
(uint8_t *) &re_compiled_code,
|
||||
sizeof (re_compiled_code_t));
|
||||
}
|
||||
|
||||
ECMA_FINALIZE (empty);
|
||||
|
||||
ECMA_FINALIZE_UTF8_STRING (pattern_start_p, pattern_start_size);
|
||||
|
||||
size_t byte_code_size = (size_t) (bc_ctx.block_end_p - bc_ctx.block_start_p);
|
||||
|
||||
if (!ecma_is_value_empty (ret_value))
|
||||
{
|
||||
/* Compilation failed, free bytecode. */
|
||||
JERRY_TRACE_MSG ("RegExp compilation failed!\n");
|
||||
jmem_heap_free_block (bc_ctx.block_start_p, byte_code_size);
|
||||
*out_bytecode_p = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef REGEXP_DUMP_BYTE_CODE
|
||||
if (JERRY_CONTEXT (jerry_init_flags) & JERRY_INIT_SHOW_REGEXP_OPCODES)
|
||||
{
|
||||
re_dump_bytecode (&bc_ctx);
|
||||
}
|
||||
#endif /* REGEXP_DUMP_BYTE_CODE */
|
||||
|
||||
/* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. */
|
||||
JERRY_ASSERT (bc_ctx.block_start_p != NULL);
|
||||
*out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p;
|
||||
|
||||
((re_compiled_code_t *) bc_ctx.block_start_p)->header.size = (uint16_t) (byte_code_size >> JMEM_ALIGNMENT_LOG);
|
||||
|
||||
if (cache_idx == RE_CACHE_SIZE)
|
||||
{
|
||||
if (JERRY_CONTEXT (re_cache_idx) == RE_CACHE_SIZE)
|
||||
{
|
||||
JERRY_CONTEXT (re_cache_idx) = 0;
|
||||
}
|
||||
|
||||
JERRY_TRACE_MSG ("RegExp cache is full! Remove the element on idx: %d\n", JERRY_CONTEXT (re_cache_idx));
|
||||
|
||||
cache_idx = JERRY_CONTEXT (re_cache_idx)++;
|
||||
|
||||
/* The garbage collector might run during the byte code
|
||||
* allocations above and it may free this entry. */
|
||||
if (JERRY_CONTEXT (re_cache)[cache_idx] != NULL)
|
||||
{
|
||||
ecma_bytecode_deref ((ecma_compiled_code_t *) JERRY_CONTEXT (re_cache)[cache_idx]);
|
||||
}
|
||||
}
|
||||
|
||||
JERRY_TRACE_MSG ("Insert bytecode into RegExp cache (idx: %d).\n", cache_idx);
|
||||
ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p);
|
||||
JERRY_CONTEXT (re_cache)[cache_idx] = *out_bytecode_p;
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* re_compile_bytecode */
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
62
third_party/jerryscript/jerry-core/parser/regexp/re-compiler.h
vendored
Normal file
62
third_party/jerryscript/jerry-core/parser/regexp/re-compiler.h
vendored
Normal file
|
@ -0,0 +1,62 @@
|
|||
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015-2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef RE_COMPILER_H
|
||||
#define RE_COMPILER_H
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
#include "ecma-globals.h"
|
||||
#include "re-bytecode.h"
|
||||
#include "re-parser.h"
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_compiler Compiler
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Context of RegExp compiler
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint16_t flags; /**< RegExp flags */
|
||||
uint32_t num_of_captures; /**< number of capture groups */
|
||||
uint32_t num_of_non_captures; /**< number of non-capture groups */
|
||||
uint32_t highest_backref; /**< highest backreference */
|
||||
re_bytecode_ctx_t *bytecode_ctx_p; /**< pointer of RegExp bytecode context */
|
||||
re_token_t current_token; /**< current token */
|
||||
re_parser_ctx_t *parser_ctx_p; /**< pointer of RegExp parser context */
|
||||
} re_compiler_ctx_t;
|
||||
|
||||
ecma_value_t
|
||||
re_compile_bytecode (const re_compiled_code_t **, ecma_string_t *, uint16_t);
|
||||
|
||||
void re_cache_gc_run ();
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
||||
#endif /* !RE_COMPILER_H */
|
920
third_party/jerryscript/jerry-core/parser/regexp/re-parser.c
vendored
Normal file
920
third_party/jerryscript/jerry-core/parser/regexp/re-parser.c
vendored
Normal file
|
@ -0,0 +1,920 @@
|
|||
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015-2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecma-exceptions.h"
|
||||
#include "ecma-globals.h"
|
||||
#include "ecma-try-catch-macro.h"
|
||||
#include "jrt-libc-includes.h"
|
||||
#include "lit-char-helpers.h"
|
||||
#include "re-compiler.h"
|
||||
#include "re-parser.h"
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_parser Parser
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lookup a character in the input string.
|
||||
*
|
||||
* @return true, if lookup number of characters ahead are hex digits
|
||||
* false, otherwise
|
||||
*/
|
||||
static bool
|
||||
re_hex_lookup (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
|
||||
uint32_t lookup) /**< size of lookup */
|
||||
{
|
||||
bool is_digit = true;
|
||||
const lit_utf8_byte_t *curr_p = parser_ctx_p->input_curr_p;
|
||||
|
||||
for (uint32_t i = 0; is_digit && i < lookup; i++)
|
||||
{
|
||||
if (curr_p < parser_ctx_p->input_end_p)
|
||||
{
|
||||
is_digit = lit_char_is_hex_digit (*curr_p++);
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return is_digit;
|
||||
} /* re_hex_lookup */
|
||||
|
||||
/**
|
||||
* Consume non greedy (question mark) character if present.
|
||||
*
|
||||
* @return true, if non-greedy character found
|
||||
* false, otherwise
|
||||
*/
|
||||
static inline bool __attr_always_inline___
|
||||
re_parse_non_greedy_char (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
|
||||
&& *parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
} /* re_parse_non_greedy_char */
|
||||
|
||||
/**
|
||||
* Parse a max 3 digit long octal number from input string iterator.
|
||||
*
|
||||
* @return uint32_t - parsed octal number
|
||||
*/
|
||||
static uint32_t
|
||||
re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
|
||||
{
|
||||
uint32_t number = 0;
|
||||
for (int index = 0;
|
||||
index < 3
|
||||
&& parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
|
||||
&& lit_char_is_octal_digit (*parser_ctx_p->input_curr_p);
|
||||
index++)
|
||||
{
|
||||
number = number * 8 + lit_char_hex_to_int (*parser_ctx_p->input_curr_p++);
|
||||
}
|
||||
|
||||
return number;
|
||||
} /* re_parse_octal */
|
||||
|
||||
/**
|
||||
* Parse RegExp iterators
|
||||
*
|
||||
* @return empty ecma value - if parsed successfully
|
||||
* error ecma value - otherwise
|
||||
*
|
||||
* Returned value must be freed with ecma_free_value
|
||||
*/
|
||||
static ecma_value_t
|
||||
re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
|
||||
re_token_t *re_token_p) /**< [out] output token */
|
||||
{
|
||||
ecma_value_t ret_value = ecma_make_simple_value (ECMA_SIMPLE_VALUE_EMPTY);
|
||||
|
||||
re_token_p->qmin = 1;
|
||||
re_token_p->qmax = 1;
|
||||
re_token_p->greedy = true;
|
||||
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ret_value;
|
||||
}
|
||||
|
||||
ecma_char_t ch = *parser_ctx_p->input_curr_p;
|
||||
|
||||
switch (ch)
|
||||
{
|
||||
case LIT_CHAR_QUESTION:
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
re_token_p->qmin = 0;
|
||||
re_token_p->qmax = 1;
|
||||
re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_ASTERISK:
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
re_token_p->qmin = 0;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_PLUS:
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
re_token_p->qmin = 1;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_LEFT_BRACE:
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
uint32_t qmin = 0;
|
||||
uint32_t qmax = RE_ITERATOR_INFINITE;
|
||||
uint32_t digits = 0;
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier"));
|
||||
}
|
||||
|
||||
ch = *parser_ctx_p->input_curr_p++;
|
||||
|
||||
if (lit_char_is_decimal_digit (ch))
|
||||
{
|
||||
if (digits >= ECMA_NUMBER_MAX_DIGITS)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: too many digits."));
|
||||
}
|
||||
digits++;
|
||||
qmin = qmin * 10 + lit_char_hex_to_int (ch);
|
||||
}
|
||||
else if (ch == LIT_CHAR_COMMA)
|
||||
{
|
||||
if (qmax != RE_ITERATOR_INFINITE)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: double comma."));
|
||||
}
|
||||
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier"));
|
||||
}
|
||||
|
||||
if (*parser_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE)
|
||||
{
|
||||
if (digits == 0)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits."));
|
||||
}
|
||||
|
||||
parser_ctx_p->input_curr_p++;
|
||||
re_token_p->qmin = qmin;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
break;
|
||||
}
|
||||
qmax = qmin;
|
||||
qmin = 0;
|
||||
digits = 0;
|
||||
}
|
||||
else if (ch == LIT_CHAR_RIGHT_BRACE)
|
||||
{
|
||||
if (digits == 0)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits."));
|
||||
}
|
||||
|
||||
if (qmax != RE_ITERATOR_INFINITE)
|
||||
{
|
||||
re_token_p->qmin = qmax;
|
||||
re_token_p->qmax = qmin;
|
||||
}
|
||||
else
|
||||
{
|
||||
re_token_p->qmin = qmin;
|
||||
re_token_p->qmax = qmin;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: unknown char."));
|
||||
}
|
||||
}
|
||||
|
||||
re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
JERRY_ASSERT (ecma_is_value_empty (ret_value));
|
||||
|
||||
if (re_token_p->qmin > re_token_p->qmax)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: qmin > qmax."));
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* re_parse_iterator */
|
||||
|
||||
/**
|
||||
* Count the number of groups in pattern
|
||||
*/
|
||||
static void
|
||||
re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
|
||||
{
|
||||
int char_class_in = 0;
|
||||
parser_ctx_p->num_of_groups = 0;
|
||||
const lit_utf8_byte_t *curr_p = parser_ctx_p->input_start_p;
|
||||
|
||||
while (curr_p < parser_ctx_p->input_end_p)
|
||||
{
|
||||
switch (*curr_p++)
|
||||
{
|
||||
case LIT_CHAR_BACKSLASH:
|
||||
{
|
||||
lit_utf8_incr (&curr_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_LEFT_SQUARE:
|
||||
{
|
||||
char_class_in++;
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_RIGHT_SQUARE:
|
||||
{
|
||||
if (char_class_in)
|
||||
{
|
||||
char_class_in--;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_LEFT_PAREN:
|
||||
{
|
||||
if (curr_p < parser_ctx_p->input_end_p
|
||||
&& *curr_p != LIT_CHAR_QUESTION
|
||||
&& !char_class_in)
|
||||
{
|
||||
parser_ctx_p->num_of_groups++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} /* re_count_num_of_groups */
|
||||
|
||||
/**
|
||||
* Read the input pattern and parse the range of character class
|
||||
*
|
||||
* @return empty ecma value - if parsed successfully
|
||||
* error ecma value - otherwise
|
||||
*
|
||||
* Returned value must be freed with ecma_free_value
|
||||
*/
|
||||
ecma_value_t
|
||||
re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
|
||||
re_char_class_callback append_char_class, /**< callback function,
|
||||
* which adds the char-ranges
|
||||
* to the bytecode */
|
||||
void *re_ctx_p, /**< regexp compiler context */
|
||||
re_token_t *out_token_p) /**< [out] output token */
|
||||
{
|
||||
re_token_type_t token_type = ((re_compiler_ctx_t *) re_ctx_p)->current_token.type;
|
||||
out_token_p->qmax = out_token_p->qmin = 1;
|
||||
ecma_char_t start = LIT_CHAR_UNDEF;
|
||||
bool is_range = false;
|
||||
parser_ctx_p->num_of_classes = 0;
|
||||
|
||||
if (lit_utf8_peek_prev (parser_ctx_p->input_curr_p) != LIT_CHAR_LEFT_SQUARE)
|
||||
{
|
||||
lit_utf8_decr (&parser_ctx_p->input_curr_p);
|
||||
lit_utf8_decr (&parser_ctx_p->input_curr_p);
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string"));
|
||||
}
|
||||
|
||||
ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
|
||||
|
||||
if (ch == LIT_CHAR_RIGHT_SQUARE)
|
||||
{
|
||||
if (start != LIT_CHAR_UNDEF)
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_MINUS)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'"));
|
||||
}
|
||||
|
||||
if (start != LIT_CHAR_UNDEF
|
||||
&& !is_range
|
||||
&& *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE)
|
||||
{
|
||||
is_range = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (ch == LIT_CHAR_BACKSLASH)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'"));
|
||||
}
|
||||
|
||||
ch = *parser_ctx_p->input_curr_p++;
|
||||
|
||||
if (ch == LIT_CHAR_LOWERCASE_B)
|
||||
{
|
||||
ch = LIT_CHAR_BS;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_F)
|
||||
{
|
||||
ch = LIT_CHAR_FF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_N)
|
||||
{
|
||||
ch = LIT_CHAR_LF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_T)
|
||||
{
|
||||
ch = LIT_CHAR_TAB;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_R)
|
||||
{
|
||||
ch = LIT_CHAR_CR;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_V)
|
||||
{
|
||||
ch = LIT_CHAR_VTAB;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_C)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
|
||||
{
|
||||
ch = *parser_ctx_p->input_curr_p;
|
||||
|
||||
if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
|
||||
|| (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
|
||||
|| (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9))
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.10 (Point 3) */
|
||||
ch = (ch % 32);
|
||||
parser_ctx_p->input_curr_p++;
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = LIT_CHAR_LOWERCASE_C;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_X)
|
||||
{
|
||||
ecma_char_t code_unit;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'"));
|
||||
}
|
||||
|
||||
parser_ctx_p->input_curr_p += 2;
|
||||
append_char_class (re_ctx_p, code_unit, code_unit);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_U)
|
||||
{
|
||||
ecma_char_t code_unit;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'"));
|
||||
}
|
||||
|
||||
parser_ctx_p->input_curr_p += 4;
|
||||
append_char_class (re_ctx_p, code_unit, code_unit);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_D)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_D)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_S)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP);
|
||||
append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */
|
||||
append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */
|
||||
append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS);
|
||||
append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */
|
||||
append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */
|
||||
append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_S)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL);
|
||||
append_char_class (re_ctx_p, 0x1681UL, 0x180DUL);
|
||||
append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL);
|
||||
append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL);
|
||||
append_char_class (re_ctx_p, 0x2030UL, 0x205EUL);
|
||||
append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL);
|
||||
append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_W)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_W)
|
||||
{
|
||||
/* See ECMA-262 v5, 15.10.2.12 */
|
||||
append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1);
|
||||
append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX);
|
||||
ch = LIT_CHAR_UNDEF;
|
||||
}
|
||||
else if (lit_char_is_octal_digit ((ecma_char_t) ch)
|
||||
&& ch != LIT_CHAR_0)
|
||||
{
|
||||
parser_ctx_p->input_curr_p--;
|
||||
ch = (ecma_char_t) re_parse_octal (parser_ctx_p);
|
||||
}
|
||||
} /* ch == LIT_CHAR_BACKSLASH */
|
||||
|
||||
if (ch == LIT_CHAR_UNDEF)
|
||||
{
|
||||
if (start != LIT_CHAR_UNDEF)
|
||||
{
|
||||
if (is_range)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, invalid range"));
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
start = LIT_CHAR_UNDEF;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (start != LIT_CHAR_UNDEF)
|
||||
{
|
||||
if (is_range)
|
||||
{
|
||||
if (start > ch)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order"));
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, ch);
|
||||
start = LIT_CHAR_UNDEF;
|
||||
is_range = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
start = ch;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
start = ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (token_type == RE_TOK_START_CHAR_CLASS || token_type == RE_TOK_START_INV_CHAR_CLASS);
|
||||
|
||||
return re_parse_iterator (parser_ctx_p, out_token_p);
|
||||
} /* re_parse_char_class */
|
||||
|
||||
/**
|
||||
* Read the input pattern and parse the next token for the RegExp compiler
|
||||
*
|
||||
* @return empty ecma value - if parsed successfully
|
||||
* error ecma value - otherwise
|
||||
*
|
||||
* Returned value must be freed with ecma_free_value
|
||||
*/
|
||||
ecma_value_t
|
||||
re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
|
||||
re_token_t *out_token_p) /**< [out] output token */
|
||||
{
|
||||
ecma_value_t ret_value = ecma_make_simple_value (ECMA_SIMPLE_VALUE_EMPTY);
|
||||
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
out_token_p->type = RE_TOK_EOF;
|
||||
return ret_value;
|
||||
}
|
||||
|
||||
ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
|
||||
|
||||
switch (ch)
|
||||
{
|
||||
case LIT_CHAR_VLINE:
|
||||
{
|
||||
out_token_p->type = RE_TOK_ALTERNATIVE;
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_CIRCUMFLEX:
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_START;
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_DOLLAR_SIGN:
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_END;
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_DOT:
|
||||
{
|
||||
out_token_p->type = RE_TOK_PERIOD;
|
||||
ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_BACKSLASH:
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid regular experssion"));
|
||||
}
|
||||
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
|
||||
|
||||
if (ch == LIT_CHAR_LOWERCASE_B)
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_B)
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_F)
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_FF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_N)
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_LF;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_T)
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_TAB;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_R)
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_CR;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_V)
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_VTAB;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_C)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
|
||||
{
|
||||
ch = *parser_ctx_p->input_curr_p;
|
||||
|
||||
if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
|
||||
|| (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END))
|
||||
{
|
||||
out_token_p->value = (ch % 32);
|
||||
parser_ctx_p->input_curr_p++;
|
||||
}
|
||||
else
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_BACKSLASH;
|
||||
parser_ctx_p->input_curr_p--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
out_token_p->value = LIT_CHAR_BACKSLASH;
|
||||
parser_ctx_p->input_curr_p--;
|
||||
}
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_X
|
||||
&& re_hex_lookup (parser_ctx_p, 2))
|
||||
{
|
||||
ecma_char_t code_unit;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
|
||||
}
|
||||
|
||||
parser_ctx_p->input_curr_p += 2;
|
||||
out_token_p->value = code_unit;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_U
|
||||
&& re_hex_lookup (parser_ctx_p, 4))
|
||||
{
|
||||
ecma_char_t code_unit;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
|
||||
}
|
||||
|
||||
parser_ctx_p->input_curr_p += 4;
|
||||
out_token_p->value = code_unit;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_D)
|
||||
{
|
||||
out_token_p->type = RE_TOK_DIGIT;
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_D)
|
||||
{
|
||||
out_token_p->type = RE_TOK_NOT_DIGIT;
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_S)
|
||||
{
|
||||
out_token_p->type = RE_TOK_WHITE;
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_S)
|
||||
{
|
||||
out_token_p->type = RE_TOK_NOT_WHITE;
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_LOWERCASE_W)
|
||||
{
|
||||
out_token_p->type = RE_TOK_WORD_CHAR;
|
||||
break;
|
||||
}
|
||||
else if (ch == LIT_CHAR_UPPERCASE_W)
|
||||
{
|
||||
out_token_p->type = RE_TOK_NOT_WORD_CHAR;
|
||||
break;
|
||||
}
|
||||
else if (lit_char_is_decimal_digit (ch))
|
||||
{
|
||||
if (ch == LIT_CHAR_0)
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
|
||||
&& lit_char_is_decimal_digit (*parser_ctx_p->input_curr_p))
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape pattern error."));
|
||||
}
|
||||
|
||||
out_token_p->value = LIT_UNICODE_CODE_POINT_NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (parser_ctx_p->num_of_groups == -1)
|
||||
{
|
||||
re_count_num_of_groups (parser_ctx_p);
|
||||
}
|
||||
|
||||
if (parser_ctx_p->num_of_groups)
|
||||
{
|
||||
parser_ctx_p->input_curr_p--;
|
||||
uint32_t number = 0;
|
||||
int index = 0;
|
||||
|
||||
do
|
||||
{
|
||||
if (index >= RE_MAX_RE_DECESC_DIGITS)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape error: decimal escape too long."));
|
||||
return ret_value;
|
||||
}
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
ecma_char_t digit = *parser_ctx_p->input_curr_p++;
|
||||
|
||||
if (!lit_char_is_decimal_digit (digit))
|
||||
{
|
||||
parser_ctx_p->input_curr_p--;
|
||||
break;
|
||||
}
|
||||
number = number * 10 + lit_char_hex_to_int (digit);
|
||||
index++;
|
||||
}
|
||||
while (true);
|
||||
|
||||
if ((int) number <= parser_ctx_p->num_of_groups)
|
||||
{
|
||||
out_token_p->type = RE_TOK_BACKREFERENCE;
|
||||
}
|
||||
else
|
||||
/* Invalid backreference, fallback to octal */
|
||||
{
|
||||
/* Rewind to start of number. */
|
||||
parser_ctx_p->input_curr_p -= index;
|
||||
|
||||
/* Try to reparse as octal. */
|
||||
ecma_char_t digit = *parser_ctx_p->input_curr_p;
|
||||
|
||||
if (!lit_char_is_octal_digit (digit))
|
||||
{
|
||||
/* Not octal, keep digit character value. */
|
||||
number = digit;
|
||||
parser_ctx_p->input_curr_p++;
|
||||
}
|
||||
else
|
||||
{
|
||||
number = re_parse_octal (parser_ctx_p);
|
||||
}
|
||||
}
|
||||
out_token_p->value = number;
|
||||
}
|
||||
else
|
||||
/* Invalid backreference, fallback to octal if possible */
|
||||
{
|
||||
if (!lit_char_is_octal_digit (ch))
|
||||
{
|
||||
/* Not octal, keep character value. */
|
||||
out_token_p->value = ch;
|
||||
}
|
||||
else
|
||||
{
|
||||
parser_ctx_p->input_curr_p--;
|
||||
out_token_p->value = re_parse_octal (parser_ctx_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
out_token_p->value = ch;
|
||||
}
|
||||
|
||||
ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_LEFT_PAREN:
|
||||
{
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated group"));
|
||||
}
|
||||
|
||||
if (*parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
|
||||
{
|
||||
parser_ctx_p->input_curr_p++;
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid group"));
|
||||
}
|
||||
|
||||
ch = *parser_ctx_p->input_curr_p++;
|
||||
|
||||
if (ch == LIT_CHAR_EQUALS)
|
||||
{
|
||||
/* (?= */
|
||||
out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD;
|
||||
}
|
||||
else if (ch == LIT_CHAR_EXCLAMATION)
|
||||
{
|
||||
/* (?! */
|
||||
out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD;
|
||||
}
|
||||
else if (ch == LIT_CHAR_COLON)
|
||||
{
|
||||
/* (?: */
|
||||
out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP;
|
||||
}
|
||||
else
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid group"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ( */
|
||||
out_token_p->type = RE_TOK_START_CAPTURE_GROUP;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_RIGHT_PAREN:
|
||||
{
|
||||
out_token_p->type = RE_TOK_END_GROUP;
|
||||
ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_LEFT_SQUARE:
|
||||
{
|
||||
out_token_p->type = RE_TOK_START_CHAR_CLASS;
|
||||
|
||||
if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class"));
|
||||
}
|
||||
|
||||
if (*parser_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX)
|
||||
{
|
||||
out_token_p->type = RE_TOK_START_INV_CHAR_CLASS;
|
||||
parser_ctx_p->input_curr_p++;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case LIT_CHAR_QUESTION:
|
||||
case LIT_CHAR_ASTERISK:
|
||||
case LIT_CHAR_PLUS:
|
||||
case LIT_CHAR_LEFT_BRACE:
|
||||
{
|
||||
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
|
||||
}
|
||||
case LIT_CHAR_NULL:
|
||||
{
|
||||
out_token_p->type = RE_TOK_EOF;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
out_token_p->value = ch;
|
||||
ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* re_parse_next_token */
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
117
third_party/jerryscript/jerry-core/parser/regexp/re-parser.h
vendored
Normal file
117
third_party/jerryscript/jerry-core/parser/regexp/re-parser.h
vendored
Normal file
|
@ -0,0 +1,117 @@
|
|||
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015-2016 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef RE_PARSER_H
|
||||
#define RE_PARSER_H
|
||||
|
||||
#ifndef CONFIG_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser Regular expression
|
||||
* @{
|
||||
*
|
||||
* \addtogroup regexparser_bytecode Bytecode
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
 * RegExp token type definitions
 */
typedef enum
{
  RE_TOK_EOF,                          /**< EOF */
  RE_TOK_BACKREFERENCE,                /**< "\N", N a decimal group number (not "\0") */
  RE_TOK_CHAR,                         /**< any character */
  RE_TOK_ALTERNATIVE,                  /**< "|" */
  RE_TOK_ASSERT_START,                 /**< "^" */
  RE_TOK_ASSERT_END,                   /**< "$" */
  RE_TOK_PERIOD,                       /**< "." */
  RE_TOK_START_CAPTURE_GROUP,          /**< "(" */
  RE_TOK_START_NON_CAPTURE_GROUP,      /**< "(?:" */
  RE_TOK_END_GROUP,                    /**< ")" */
  RE_TOK_ASSERT_START_POS_LOOKAHEAD,   /**< "(?=" */
  RE_TOK_ASSERT_START_NEG_LOOKAHEAD,   /**< "(?!" */
  RE_TOK_ASSERT_WORD_BOUNDARY,         /**< "\b" */
  RE_TOK_ASSERT_NOT_WORD_BOUNDARY,     /**< "\B" */
  RE_TOK_DIGIT,                        /**< "\d" */
  RE_TOK_NOT_DIGIT,                    /**< "\D" */
  RE_TOK_WHITE,                        /**< "\s" */
  RE_TOK_NOT_WHITE,                    /**< "\S" */
  RE_TOK_WORD_CHAR,                    /**< "\w" */
  RE_TOK_NOT_WORD_CHAR,                /**< "\W" */
  RE_TOK_START_CHAR_CLASS,             /**< "[ ]" */
  RE_TOK_START_INV_CHAR_CLASS,         /**< "[^ ]" */
} re_token_type_t;
|
||||
|
||||
/**
|
||||
* @}
|
||||
*
|
||||
* \addtogroup regexparser_parser Parser
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* RegExp constant of infinite
|
||||
*/
|
||||
#define RE_ITERATOR_INFINITE ((uint32_t) - 1)
|
||||
|
||||
/**
|
||||
* Maximum number of decimal escape digits
|
||||
*/
|
||||
#define RE_MAX_RE_DECESC_DIGITS 9
|
||||
|
||||
/**
|
||||
* RegExp token type
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
re_token_type_t type; /**< type of the token */
|
||||
uint32_t value; /**< value of the token */
|
||||
uint32_t qmin; /**< minimum number of token iterations */
|
||||
uint32_t qmax; /**< maximum number of token iterations */
|
||||
bool greedy; /**< type of iteration */
|
||||
} re_token_t;
|
||||
|
||||
/**
|
||||
* RegExp parser context
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
const lit_utf8_byte_t *input_start_p; /**< start of input pattern */
|
||||
const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */
|
||||
const lit_utf8_byte_t *input_end_p; /**< end of input pattern */
|
||||
int num_of_groups; /**< number of groups */
|
||||
uint32_t num_of_classes; /**< number of character classes */
|
||||
} re_parser_ctx_t;
|
||||
|
||||
typedef void (*re_char_class_callback) (void *re_ctx_p, ecma_char_t start, ecma_char_t end);
|
||||
|
||||
ecma_value_t
|
||||
re_parse_char_class (re_parser_ctx_t *, re_char_class_callback, void *, re_token_t *);
|
||||
|
||||
ecma_value_t
|
||||
re_parse_next_token (re_parser_ctx_t *, re_token_t *);
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* !CONFIG_DISABLE_REGEXP_BUILTIN */
|
||||
#endif /* !RE_PARSER_H */
|
Loading…
Add table
Add a link
Reference in a new issue