diff --git a/.gitignore b/.gitignore
index bab5ca1..c8f0442 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,3 @@ docs/_build/
 
 # Pyenv
 .python-version
-
-# Avoid ignoring shadcn utils
-!demo/harmony-demo/src/lib
diff --git a/Cargo.lock b/Cargo.lock
index e0b0a71..ce97b77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1317,7 +1317,7 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
 
 [[package]]
 name = "openai-harmony"
-version = "0.0.4"
+version = "0.0.3"
 dependencies = [
  "anyhow",
  "base64",
diff --git a/Cargo.toml b/Cargo.toml
index 23fa1ac..25d070c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "openai-harmony"
-version = "0.0.4"
+version = "0.0.3"
 edition = "2021"
 license = "Apache-2.0"
 repository = "https://github.com/openai/harmony"
diff --git a/demo/harmony-demo/src/lib/utils.ts b/demo/harmony-demo/src/lib/utils.ts
deleted file mode 100644
index bd0c391..0000000
--- a/demo/harmony-demo/src/lib/utils.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-import { clsx, type ClassValue } from "clsx"
-import { twMerge } from "tailwind-merge"
-
-export function cn(...inputs: ClassValue[]) {
-  return twMerge(clsx(inputs))
-}
diff --git a/src/encoding.rs b/src/encoding.rs
index 6a9305b..afe1fce 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -835,23 +835,7 @@ impl Render for HarmonyEncoding {
 
         // finally content type
         if let Some(content_type) = &message.content_type {
-            // <|constrain|> is a unique case which needs to be tokenized as a special token
-            if let Some(constrain_marker) =
-                self.mapped_format_token(FormattingToken::ConstrainedFormat)
-            {
-                if let Some(rest) = content_type.strip_prefix(constrain_marker) {
-                    // Render the space, then the constrain marker as a special token, then the rest as text (if any)
-                    self.render_text_into(" ", into)?;
-                    self.render_formatting_token_into(FormattingToken::ConstrainedFormat, into)?;
-                    if !rest.is_empty() {
-                        self.render_text_into(rest, into)?;
-                    }
-                } else {
-                    self.render_text_into(format!(" {content_type}"), into)?;
-                }
-            } else {
-                self.render_text_into(format!(" {content_type}"), into)?;
-            }
+            self.render_text_into(format!(" {content_type}"), into)?;
         }
 
         self.render_formatting_token_into(FormattingToken::Message, into)?;
diff --git a/tests/test_harmony.py b/tests/test_harmony.py
index dd34e81..07d5562 100644
--- a/tests/test_harmony.py
+++ b/tests/test_harmony.py
@@ -233,36 +233,6 @@ def test_simple_tool_call(encoding_name):
     assert parsed == expected
 
 
-@pytest.mark.parametrize(
-    "encoding_name",
-    [
-        HarmonyEncodingName.HARMONY_GPT_OSS,
-    ],
-)
-def test_tool_call_with_constrain_tokenized_correctly(encoding_name):
-    """
-    Despite passing <|constrain|> as a string in "content_type" it has to be kept as a special token.
-    """
-    encoding = load_harmony_encoding(encoding_name)
-    text = (
-        "<|start|>assistant to=functions.get_weather<|channel|>commentary"
-        ' <|constrain|>json<|message|>{"location": "Tokyo"}<|call|>'
-    )
-    tokens = encoding.encode(text, allowed_special="all")
-    parsed = encoding.parse_messages_from_completion_tokens(tokens, role=None)
-    expected = [
-        Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
-        .with_channel("commentary")
-        .with_recipient("functions.get_weather")
-        .with_content_type("<|constrain|>json"),
-    ]
-    assert parsed == expected
-
-    rendered = encoding.render_conversation(Conversation.from_messages(expected))
-    assert text == encoding.decode_utf8(tokens)
-    assert rendered == tokens
-
-
 @pytest.mark.parametrize(
     "encoding_name",
     [
@@ -278,7 +248,7 @@ def test_tool_call_with_constrain_marker_adjacent(encoding_name):
     encoding = load_harmony_encoding(encoding_name)
     text = (
         "<|start|>assistant to=functions.get_weather<|channel|>commentary"
-        '<|constrain|>json<|message|>{"location": "Tokyo"}<|call|>'
+        '<|constrain|>json<|message|>{"location": "Tokyo"}<|end|>'
     )
     tokens = encoding.encode(text, allowed_special="all")
     parsed = encoding.parse_messages_from_completion_tokens(tokens, role=None)
@@ -732,8 +702,6 @@ def test_does_not_drop_if_ongoing_analysis():
     )
     assert encoding.decode_utf8(tokens) == expected_output
 
-    # ensure that <|constrain|>json part is tokenized correctly as special tokens
-    assert encoding.encode(expected_output, allowed_special="all") == tokens
 
 
 def test_preserve_cot():