    SystemContent,
    ToolDescription,
    load_harmony_encoding,
+    load_harmony_encoding_from_file,
)

from pydantic import ValidationError
@@ -949,3 +950,67 @@ def test_streamable_parser_tool_call_with_constrain_adjacent():
    ]

    assert parser.messages == expected
+
+
+def test_load_harmony_encoding_from_file(tmp_path):
+    import os
+    from openai_harmony import load_harmony_encoding_from_file
+
+    cache_dir = os.environ.get("TIKTOKEN_RS_CACHE_DIR")
+    if not cache_dir:
+        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "tiktoken-rs-cache")
+    import hashlib
+    url = "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
+    cache_key = hashlib.sha1(url.encode()).hexdigest()
+    vocab_file = os.path.join(cache_dir, cache_key)
+    if not os.path.exists(vocab_file):
+        import pytest
+        pytest.skip("No local vocab file available for offline test")
+
+    special_tokens = [
+        ("<|startoftext|>", 199998),
+        ("<|endoftext|>", 199999),
+        ("<|reserved_200000|>", 200000),
+        ("<|reserved_200001|>", 200001),
+        ("<|return|>", 200002),
+        ("<|constrain|>", 200003),
+        ("<|reserved_200004|>", 200004),
+        ("<|channel|>", 200005),
+        ("<|start|>", 200006),
+        ("<|end|>", 200007),
+        ("<|message|>", 200008),
+        ("<|reserved_200009|>", 200009),
+        ("<|reserved_200010|>", 200010),
+        ("<|reserved_200011|>", 200011),
+        ("<|call|>", 200012),
+        ("<|reserved_200013|>", 200013),
+    ]
+    pattern = "|".join([
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ])
+    n_ctx = 8192
+    max_message_tokens = 4096
+    max_action_length = 256
+    expected_hash = "446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d"
+
+    encoding = load_harmony_encoding_from_file(
+        name="test_local",
+        vocab_file=vocab_file,
+        special_tokens=special_tokens,
+        pattern=pattern,
+        n_ctx=n_ctx,
+        max_message_tokens=max_message_tokens,
+        max_action_length=max_action_length,
+        expected_hash=expected_hash,
+    )
+
+    text = "Hello world!"
+    tokens = encoding.encode(text)
+    decoded = encoding.decode(tokens)
+    assert decoded.startswith("Hello world")