From 98ac66539bba5d96c12e34a9bef03278676c8dba Mon Sep 17 00:00:00 2001
From: Xiangzhuang Shen
Date: Thu, 31 Jul 2025 13:20:07 +0800
Subject: [PATCH 1/5] chore(trainers): add __init__ to fix python type check errors

---
 bindings/python/py_src/tokenizers/trainers/__init__.pyi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
index d6c525718..fec4a5c82 100644
--- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
@@ -109,6 +109,14 @@ class WordLevelTrainer(Trainer):
         special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
     """
+    def __init__(
+        self,
+        vocab_size=30000,
+        min_frequency=0,
+        show_progress=True,
+        special_tokens=[],
+    ):
+        pass
 
 class WordPieceTrainer(Trainer):
     """

From 7ccf93c665337b9e25aa8b8a7f0f7409d7ec1dcd Mon Sep 17 00:00:00 2001
From: Xiangzhuang Shen
Date: Thu, 28 Aug 2025 15:07:10 +0800
Subject: [PATCH 2/5] restore

---
 bindings/python/py_src/tokenizers/trainers/__init__.pyi | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
index fec4a5c82..d6c525718 100644
--- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
@@ -109,14 +109,6 @@ class WordLevelTrainer(Trainer):
         special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
     """
-    def __init__(
-        self,
-        vocab_size=30000,
-        min_frequency=0,
-        show_progress=True,
-        special_tokens=[],
-    ):
-        pass
 
 class WordPieceTrainer(Trainer):
     """

From 0547793e19282a3c35188ae4ab177735bd5078ba Mon Sep 17 00:00:00 2001
From: Xiangzhuang Shen
Date: Fri, 29 Aug 2025 15:50:15 +0800
Subject: [PATCH 3/5] chore(trainer): add and improve trainer signature

---
 bindings/python/src/trainers.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index ef2c31e56..51775495f 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -312,7 +312,10 @@ impl PyBpeTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (**kwargs), text_signature = None)]
+    #[pyo3(
+        signature = (**kwargs), 
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None, max_token_length=None, words={})"
+    )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -518,7 +521,7 @@ impl PyWordPieceTrainer {
     #[new]
     #[pyo3(
         signature = (** kwargs),
-        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
     )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
@@ -659,7 +662,9 @@ impl PyWordLevelTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (**kwargs), text_signature = None)]
+    #[pyo3(
+        signature = (**kwargs), 
+        text_signature = "(self, vocab_size=3000, min_frequency=0, show_progress=True, special_tokens=[])")
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
 

From 3d775329b8ad778956aece01298fc7147816de40 Mon Sep 17 00:00:00 2001
From: Xiangzhuang Shen
Date: Fri, 29 Aug 2025 17:33:52 +0800
Subject: [PATCH 4/5] clean fix

---
 .../py_src/tokenizers/trainers/__init__.pyi | 17 +++++++++++++++++
 bindings/python/src/trainers.rs             |  5 +++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
index d6c525718..58174ab44 100644
--- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
@@ -45,6 +45,20 @@ class BpeTrainer(Trainer):
             highly repetitive tokens like `======` for wikipedia
 
     """
+    def __init__(
+        self,
+        vocab_size=30000,
+        min_frequency=0,
+        show_progress=True,
+        special_tokens=[],
+        limit_alphabet=None,
+        initial_alphabet=[],
+        continuing_subword_prefix=None,
+        end_of_word_suffix=None,
+        max_token_length=None,
+        words={},
+    ):
+        pass
 
 class UnigramTrainer(Trainer):
     """
@@ -85,6 +99,7 @@ class UnigramTrainer(Trainer):
         vocab_size=8000,
         show_progress=True,
         special_tokens=[],
+        initial_alphabet=[],
         shrinking_factor=0.75,
         unk_token=None,
         max_piece_length=16,
@@ -109,6 +124,8 @@ class WordLevelTrainer(Trainer):
         special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
     """
+    def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
+        pass
 
 class WordPieceTrainer(Trainer):
     """
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 51775495f..2696a7dde 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -664,7 +664,8 @@ impl PyWordLevelTrainer {
     #[new]
     #[pyo3(
         signature = (**kwargs), 
-        text_signature = "(self, vocab_size=3000, min_frequency=0, show_progress=True, special_tokens=[])")
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[])"
+    )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
 
@@ -831,7 +832,7 @@ impl PyUnigramTrainer {
     #[new]
    #[pyo3(
        signature = (**kwargs),
-        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], initial_alphabet=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
     )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();

From 8da5806ba006c24f6719c288d875e5d74c73f794 Mon Sep 17 00:00:00 2001
From: Xiangzhuang Shen
Date: Fri, 12 Sep 2025 16:57:27 +0800
Subject: [PATCH 5/5] chore(fmt): fix cargo fmt error

---
 bindings/python/src/trainers.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 2696a7dde..30786862e 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -313,7 +313,7 @@ impl PyBpeTrainer {
 
     #[new]
     #[pyo3(
-        signature = (**kwargs), 
+        signature = (**kwargs),
         text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None, max_token_length=None, words={})"
     )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
@@ -663,7 +663,7 @@ impl PyWordLevelTrainer {
 
     #[new]
     #[pyo3(
-        signature = (**kwargs), 
+        signature = (**kwargs),
         text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[])"
     )]
     pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
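
Usage sketch (illustration only, not part of the patches above): it assumes a `tokenizers` build that includes these changes, and the concrete values are made up. It exercises the constructor keywords that the new `__init__` stubs and `text_signature` strings document.

# Sketch: constructor keywords documented by the new __init__ stubs and
# pyo3 text_signature strings above. Values are illustrative only.
from tokenizers import trainers

bpe_trainer = trainers.BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]"],
    max_token_length=16,
)

word_level_trainer = trainers.WordLevelTrainer(
    vocab_size=30000,
    min_frequency=0,
    show_progress=True,
    special_tokens=["[UNK]"],
)

# With __text_signature__ populated on the Rust side, help(trainers.BpeTrainer)
# can show these parameters instead of a bare (*args, **kwargs), and the stub
# __init__ definitions keep static type checkers from flagging the keyword
# arguments above as unknown.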