From 8a4645f0f9d9c20a8ecd3381dbb88ce5f56084cc Mon Sep 17 00:00:00 2001 From: Dmytro Dzhulgakov Date: Fri, 8 Aug 2025 17:50:17 -0700 Subject: [PATCH 1/3] Fix tokenization of <|constrain|> content type in rendering (#47) --- src/encoding.rs | 17 ++++++++++++++++- tests/test_harmony.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index afe1fce..d57f8ec 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -835,7 +835,22 @@ impl Render for HarmonyEncoding { // finally content type if let Some(content_type) = &message.content_type { - self.render_text_into(format!(" {content_type}"), into)?; + // <|constrain|> is a unique case which needs to be tokenized as a special token + if let Some(constrain_marker) = self.mapped_format_token(FormattingToken::ConstrainedFormat) { + if content_type.starts_with(constrain_marker) { + // Render the space, then the constrain marker as a special token, then the rest as text (if any) + self.render_text_into(" ", into)?; + self.render_formatting_token_into(FormattingToken::ConstrainedFormat, into)?; + let rest = &content_type[constrain_marker.len()..]; + if !rest.is_empty() { + self.render_text_into(rest, into)?; + } + } else { + self.render_text_into(format!(" {content_type}"), into)?; + } + } else { + self.render_text_into(format!(" {content_type}"), into)?; + } } self.render_formatting_token_into(FormattingToken::Message, into)?; diff --git a/tests/test_harmony.py b/tests/test_harmony.py index 07d5562..dd34e81 100644 --- a/tests/test_harmony.py +++ b/tests/test_harmony.py @@ -233,6 +233,36 @@ def test_simple_tool_call(encoding_name): assert parsed == expected +@pytest.mark.parametrize( + "encoding_name", + [ + HarmonyEncodingName.HARMONY_GPT_OSS, + ], +) +def test_tool_call_with_constrain_tokenized_correctly(encoding_name): + """ + Despite passing <|constrain|> as a string in "content_type" it has to be kept as a special token. + """ + encoding = load_harmony_encoding(encoding_name) + text = ( + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + ' <|constrain|>json<|message|>{"location": "Tokyo"}<|call|>' + ) + tokens = encoding.encode(text, allowed_special="all") + parsed = encoding.parse_messages_from_completion_tokens(tokens, role=None) + expected = [ + Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}') + .with_channel("commentary") + .with_recipient("functions.get_weather") + .with_content_type("<|constrain|>json"), + ] + assert parsed == expected + + rendered = encoding.render_conversation(Conversation.from_messages(expected)) + assert text == encoding.decode_utf8(tokens) + assert rendered == tokens + + @pytest.mark.parametrize( "encoding_name", [ @@ -248,7 +278,7 @@ def test_tool_call_with_constrain_marker_adjacent(encoding_name): encoding = load_harmony_encoding(encoding_name) text = ( "<|start|>assistant to=functions.get_weather<|channel|>commentary" - '<|constrain|>json<|message|>{"location": "Tokyo"}<|end|>' + '<|constrain|>json<|message|>{"location": "Tokyo"}<|call|>' ) tokens = encoding.encode(text, allowed_special="all") parsed = encoding.parse_messages_from_completion_tokens(tokens, role=None) @@ -702,6 +732,8 @@ def test_does_not_drop_if_ongoing_analysis(): ) assert encoding.decode_utf8(tokens) == expected_output + # ensure that <|constrain|>json part is tokenized correctly as special tokens + assert encoding.encode(expected_output, allowed_special="all") == tokens def test_preserve_cot(): From 72079ca4971f326dddcd5ff62bfb7e2fff37a07a Mon Sep 17 00:00:00 2001 From: Dominik Kundel Date: Fri, 8 Aug 2025 18:27:00 -0700 Subject: [PATCH 2/3] Fix formatting (#51) * Fix formatting * fix format * fix clippy error * bump version --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/encoding.rs | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ce97b77..e0b0a71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1317,7 +1317,7 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "openai-harmony" -version = "0.0.3" +version = "0.0.4" dependencies = [ "anyhow", "base64", diff --git a/Cargo.toml b/Cargo.toml index 25d070c..23fa1ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "openai-harmony" -version = "0.0.3" +version = "0.0.4" edition = "2021" license = "Apache-2.0" repository = "https://github.com/openai/harmony" diff --git a/src/encoding.rs b/src/encoding.rs index d57f8ec..6a9305b 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -836,12 +836,13 @@ impl Render for HarmonyEncoding { // finally content type if let Some(content_type) = &message.content_type { // <|constrain|> is a unique case which needs to be tokenized as a special token - if let Some(constrain_marker) = self.mapped_format_token(FormattingToken::ConstrainedFormat) { - if content_type.starts_with(constrain_marker) { + if let Some(constrain_marker) = + self.mapped_format_token(FormattingToken::ConstrainedFormat) + { + if let Some(rest) = content_type.strip_prefix(constrain_marker) { // Render the space, then the constrain marker as a special token, then the rest as text (if any) self.render_text_into(" ", into)?; self.render_formatting_token_into(FormattingToken::ConstrainedFormat, into)?; - let rest = &content_type[constrain_marker.len()..]; if !rest.is_empty() { self.render_text_into(rest, into)?; } From 52176bfbec8f1d5e876b9793626a4153c71b3d4a Mon Sep 17 00:00:00 2001 From: nk <66731869+nicognaW@users.noreply.github.com> Date: Sat, 9 Aug 2025 09:28:08 +0800 Subject: [PATCH 3/3] Fix shadcn utils file in js demo (#20) --- .gitignore | 3 +++ demo/harmony-demo/src/lib/utils.ts | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 demo/harmony-demo/src/lib/utils.ts diff --git a/.gitignore b/.gitignore index c8f0442..bab5ca1 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,6 @@ docs/_build/ # Pyenv .python-version + +# Avoid ignoring shadcn utils +!demo/harmony-demo/src/lib diff --git a/demo/harmony-demo/src/lib/utils.ts b/demo/harmony-demo/src/lib/utils.ts new file mode 100644 index 0000000..bd0c391 --- /dev/null +++ b/demo/harmony-demo/src/lib/utils.ts @@ -0,0 +1,6 @@ +import { clsx, type ClassValue } from "clsx" +import { twMerge } from "tailwind-merge" + +export function cn(...inputs: ClassValue[]) { + return twMerge(clsx(inputs)) +}