From 281e88bcc5cd11a69bba88310d4b479d4a5b32ce Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 20 Sep 2025 22:35:18 +0300
Subject: [PATCH 1/2] enable `PTH` rule

---
 docs/mmteb/create_points_table.py             | 13 +++----
 docs/mmteb/validate_points.py                 | 39 +++++++++----------
 mteb/MTEB.py                                  |  4 +-
 mteb/_evaluators/_download.py                 |  4 +-
 mteb/cli/generate_readme.py                   |  2 +-
 .../model_implementations/cache_wrapper.py    |  8 ++--
 .../model_implementations/evaclip_models.py   |  4 +-
 .../ZeroShotClassification/eng/Country211.py  |  6 +--
 .../Image/ZeroShotClassification/eng/GTSRB.py |  6 +--
 .../ZeroShotClassification/eng/Imagenet1k.py  |  6 +--
 .../eng/PatchCamelyon.py                      |  6 +--
 mteb/tasks/Retrieval/eng/MLQuestions.py       | 10 +++--
 pyproject.toml                                |  7 ++--
 tests/test_models/model_loading.py            |  2 +-
 14 files changed, 58 insertions(+), 59 deletions(-)

diff --git a/docs/mmteb/create_points_table.py b/docs/mmteb/create_points_table.py
index 0e4a2a04dd..740e7912c7 100644
--- a/docs/mmteb/create_points_table.py
+++ b/docs/mmteb/create_points_table.py
@@ -6,18 +6,15 @@
 import pandas as pd
 
 
-def load_data() -> pd.DataFrame:
-    file_path = Path(__file__).parent / "points"
+def load_data(file_path: Path) -> pd.DataFrame:
     files = file_path.glob("*.jsonl")
 
     json_data = []
     for file in files:
-        with open(file) as f:
+        with file.open() as f:
             for line in f:
                 json_data.append(json.loads(line))
-
-    df = pd.DataFrame(json_data)
-    return df
+    return pd.DataFrame(json_data)
 
 
 def save_to_markdown(df: pd.DataFrame, file_path: Path) -> None:
@@ -30,7 +27,7 @@ def save_to_markdown(df: pd.DataFrame, file_path: Path) -> None:
     md = df.to_markdown()
     # add title
     md = f"# Points\n\n_Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions.\n\n {md}"
-    with open(file_path, "w") as f:
+    with file_path.open("w") as f:
         f.write(md)
 
 
@@ -38,5 +35,5 @@ def save_to_markdown(df: pd.DataFrame, file_path: Path) -> None:
     file_path = Path(__file__).parent / "points"
     save_path = Path(__file__).parent / "points_table.md"
 
-    df = load_data()
+    df = load_data(file_path)
     save_to_markdown(df, save_path)
diff --git a/docs/mmteb/validate_points.py b/docs/mmteb/validate_points.py
index 5b2a4614ac..1975b396e1 100644
--- a/docs/mmteb/validate_points.py
+++ b/docs/mmteb/validate_points.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-import os
+from pathlib import Path
 from typing import Optional
 
 from jsonlines import Reader
@@ -44,32 +44,29 @@ def check_max_points(obj: JsonObject, commit_n: str):
 
 
 # Function to validate JSONL files in a folder
 def validate_jsonl_files(folder_path):
-    for filename in os.listdir(folder_path):
-        if filename.endswith(".jsonl"):
-            file_path = os.path.join(folder_path, filename)
-            commit_n = os.path.splitext(filename)[0]
-            with open(file_path, encoding="utf-8") as file:
+    folder_path = Path(folder_path)
+    for file_path in folder_path.glob("*.jsonl"):
+        commit_n = file_path.stem
+        with file_path.open(encoding="utf-8") as file:
+            try:
+                # Read JSONL file
+                reader = Reader(file)
+            except Exception:
+                raise Exception("Error reading file:", file_path)
+            for line in reader:
                 try:
-                    # Read JSONL file
-                    reader = Reader(file)
-                except Exception:
-                    raise Exception("Error reading file:", file_path)
-                for line in reader:
-                    try:
-                        # Validate JSON object against schema
-                        x = JsonObject(**line)
-                        logging.debug(x)
+                    # Validate JSON object against schema
+                    x = JsonObject(**line)
+                    logging.debug(x)
+                    check_max_points(x, commit_n)
 
-                    except ValidationError as e:
-                        raise Exception(
-                            "Validation Error in file:", file_path, line
-                        ) from e
+                except ValidationError as e:
+                    raise Exception("Validation Error in file:", file_path, line) from e
 
 
 # Main function
 def main():
-    folder_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "points")
+    folder_path = Path(__file__).parent / "points"
     validate_jsonl_files(folder_path)
 
diff --git a/mteb/MTEB.py b/mteb/MTEB.py
index 7757043e9d..fdb97ed787 100644
--- a/mteb/MTEB.py
+++ b/mteb/MTEB.py
@@ -73,7 +73,7 @@ def __init__(
         self.benchmarks = tasks
         self.tasks = list(chain.from_iterable(self.tasks))
 
-        self.err_logs_path = err_logs_path
+        self.err_logs_path = Path(err_logs_path)
         self.last_evaluated_splits = {}
 
     @property
@@ -541,7 +541,7 @@ def run(
                     logger.error(
                         f"Please check all the error logs at: {self.err_logs_path}"
                     )
-                with open(self.err_logs_path, "a") as f_out:
+                with self.err_logs_path.open("a") as f_out:
                     f_out.write(f"{datetime.now()} >>> {task.metadata.name}\n")
                     f_out.write(traceback.format_exc())
                     f_out.write("\n\n")
diff --git a/mteb/_evaluators/_download.py b/mteb/_evaluators/_download.py
index 1c053c7153..8366c1cda1 100644
--- a/mteb/_evaluators/_download.py
+++ b/mteb/_evaluators/_download.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from pathlib import Path
+
 import requests
 import tqdm
 
@@ -9,7 +11,7 @@ def download(url: str, fname: str):
     resp = requests.get(url, stream=True)
     total = int(resp.headers.get("content-length", 0))
     with (
-        open(fname, "wb") as file,
+        Path(fname).open("wb") as file,
         tqdm.tqdm(
             desc=fname,
             total=total,
diff --git a/mteb/cli/generate_readme.py b/mteb/cli/generate_readme.py
index 66a55ce1fa..7c77b719d3 100644
--- a/mteb/cli/generate_readme.py
+++ b/mteb/cli/generate_readme.py
@@ -144,7 +144,7 @@ def _merge_yamls(
     if not existing_readme.name.lower().endswith(".md"):
         raise ValueError("Readme file should be markdown and end with '.md'")
 
-    with open(existing_readme) as f:
+    with existing_readme.open() as f:
         existing_file = f.read()
 
     existing_yaml_dict, readme_end = _extract_yaml_and_content(existing_file)
diff --git a/mteb/models/model_implementations/cache_wrapper.py b/mteb/models/model_implementations/cache_wrapper.py
index b3644ac668..879ecee465 100644
--- a/mteb/models/model_implementations/cache_wrapper.py
+++ b/mteb/models/model_implementations/cache_wrapper.py
@@ -114,7 +114,7 @@ def _double_vectors_file(self) -> None:
         self.vectors = new_vectors
 
     def _save_dimension(self) -> None:
-        with open(self.dimension_file, "w") as f:
+        with self.dimension_file.open("w") as f:
             f.write(str(self.vector_dim))
         logger.info(
             f"Saved vector dimension {self.vector_dim} to {self.dimension_file}"
@@ -122,7 +122,7 @@
 
     def _load_dimension(self) -> None:
         if self.dimension_file.exists():
-            with open(self.dimension_file) as f:
+            with self.dimension_file.open() as f:
                 self.vector_dim = int(f.read().strip())
             logger.info(
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
@@ -144,7 +144,7 @@ def save(self) -> None:
             for hash_, index in self.hash_to_index.items()
         }
 
-        with open(self.index_file, "w", encoding="utf-8") as f:
+        with self.index_file.open("w", encoding="utf-8") as f:
             json.dump(serializable_index, f, indent=2)
         self._save_dimension()
         logger.info(f"Saved VectorCacheMap to {self.directory}")
@@ -156,7 +156,7 @@ def load(self, name: str | None = None) -> None:
         try:
            self._load_dimension()
            if self.index_file.exists() and self.vectors_file.exists():
-                with open(self.index_file, encoding="utf-8") as f:
+                with self.index_file.open(encoding="utf-8") as f:
                     loaded_index = json.load(f)
                     self.hash_to_index = {
                         str(hash_): int(index)  # Ensure we maintain the correct types
diff --git a/mteb/models/model_implementations/evaclip_models.py b/mteb/models/model_implementations/evaclip_models.py
index 0763ccce91..207dc38341 100644
--- a/mteb/models/model_implementations/evaclip_models.py
+++ b/mteb/models/model_implementations/evaclip_models.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any
 
 import torch
@@ -15,10 +16,9 @@
 
 def evaclip_loader(model_name, **kwargs):
     try:
-        import os
         import sys
 
-        sys.path.insert(0, os.path.join(os.getcwd(), "EVA/EVA-CLIP/rei"))
+        sys.path.insert(0, str(Path.cwd() / "EVA" / "EVA-CLIP" / "rei"))
 
         from eva_clip import create_model_and_transforms, get_tokenizer
     except ImportError:
diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py
index a7949b8e19..c6dd599be3 100644
--- a/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py
+++ b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -47,8 +47,8 @@ class Country211ZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Country211_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Country211_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
         return [f"a photo showing the country of {c}." for c in labels]
 
diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py
index 30d6878ed2..925e44bec0 100644
--- a/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py
+++ b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -52,8 +52,8 @@ class GTSRBZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/GTSRB_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "GTSRB_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
         return [f"a close up photo of a '{c}' traffic sign." for c in labels]
 
diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py
index 3ed110a2ac..24626d7df2 100644
--- a/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py
+++ b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -48,8 +48,8 @@ class Imagenet1kZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Imagenet1k_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Imagenet1k_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
         return [f"a photo of {c}." for c in labels]
 
diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py
index 7727faef59..2b47cdddc3 100644
--- a/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py
+++ b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -60,8 +60,8 @@ class PatchCamelyonZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/PatchCamelyon_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "PatchCamelyon_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
         return [f"histopathology image of {c}" for c in labels]
 
diff --git a/mteb/tasks/Retrieval/eng/MLQuestions.py b/mteb/tasks/Retrieval/eng/MLQuestions.py
index 49519ebd20..6995228a21 100644
--- a/mteb/tasks/Retrieval/eng/MLQuestions.py
+++ b/mteb/tasks/Retrieval/eng/MLQuestions.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import csv
+from pathlib import Path
 
 from huggingface_hub import snapshot_download
 
@@ -80,8 +81,9 @@ def load_data(self) -> None:
 
     def _load_data_for_split(self, download_dir, split):
         queries, corpus, qrels = {}, {}, {}
-        dataset_path = f"{download_dir}/{split}.csv"
-        with open(dataset_path) as csvfile:
+        download_dir = Path(download_dir)
+        dataset_path = download_dir / f"{split}.csv"
+        with dataset_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 query_id = f"Q{str(i)}"
@@ -91,8 +93,8 @@
             qrels[query_id] = {f"C{doc_id}": 1}
 
         # Same corpus for all splits
-        corpus_path = f"{download_dir}/test_passages.csv"
-        with open(corpus_path) as csvfile:
+        corpus_path = download_dir / "test_passages.csv"
+        with corpus_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 doc_id = f"C{str(i)}"
diff --git a/pyproject.toml b/pyproject.toml
index 2e4da63816..b027e26b68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -161,7 +161,6 @@ namespaces = false
 "mteb.tasks.Image.ZeroShotClassification.eng.templates" = ["*.txt"]
 
 [tool.ruff]
-
 target-version = "py39"
 
 
@@ -193,10 +192,9 @@ select = [
 
     # would exclude:
     # "N806", # probably not worth it
    # "N812" # disallows: import torch.nn.functional as F which is standard
-
+    "PTH",
 ]
-
 ignore = [
     "E501", # line too long
     "E741", # ambiguous variable name
@@ -213,6 +211,9 @@ ignore = [
     "C408", # don't use unecc. collection call, e.g. dict over {}
 ]
 
+[tool.ruff.lint.per-file-ignores]
+"scripts/*" = ["PTH"]
+
 [tool.ruff.lint.flake8-implicit-str-concat]
 allow-multiline = false
 
diff --git a/tests/test_models/model_loading.py b/tests/test_models/model_loading.py
index a922b01c30..61f0d026ed 100644
--- a/tests/test_models/model_loading.py
+++ b/tests/test_models/model_loading.py
@@ -95,7 +95,7 @@ def parse_args():
     elif args.model_name_file:
         all_model_names = []
         if Path(args.model_name_file).exists():
-            with open(args.model_name_file) as f:
+            with args.model_name_file.open() as f:
                 all_model_names = f.read().strip().split()
         else:
             logging.warning(

From c0914da12c9fa0457f6786886042b2f97a948019 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 20 Sep 2025 22:54:04 +0300
Subject: [PATCH 2/2] fix script

---
 tests/test_models/model_loading.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_models/model_loading.py b/tests/test_models/model_loading.py
index 61f0d026ed..b1c330d73d 100644
--- a/tests/test_models/model_loading.py
+++ b/tests/test_models/model_loading.py
@@ -94,8 +94,9 @@ def parse_args():
         all_model_names = args.model_name
     elif args.model_name_file:
         all_model_names = []
-        if Path(args.model_name_file).exists():
-            with args.model_name_file.open() as f:
+        model_name_file = Path(args.model_name_file)
+        if model_name_file.exists():
+            with model_name_file.open() as f:
                 all_model_names = f.read().strip().split()
         else:
             logging.warning(