docs/mmteb/create_points_table.py (13 changes: 5 additions & 8 deletions)
@@ -6,18 +6,15 @@
 import pandas as pd
 
 
-def load_data() -> pd.DataFrame:
-    file_path = Path(__file__).parent / "points"
+def load_data(file_path: Path) -> pd.DataFrame:
     files = file_path.glob("*.jsonl")
 
     json_data = []
     for file in files:
-        with open(file) as f:
+        with file.open() as f:
             for line in f:
                 json_data.append(json.loads(line))
 
-    df = pd.DataFrame(json_data)
-    return df
+    return pd.DataFrame(json_data)
@@ -30,13 +27,13 @@ def save_to_markdown(df: pd.DataFrame, file_path: Path) -> None:
     md = df.to_markdown()
     # add title
     md = f"# Points\n\n_Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions.\n\n {md}"
-    with open(file_path, "w") as f:
+    with file_path.open("w") as f:
        f.write(md)
 
 
 if __name__ == "__main__":
+    file_path = Path(__file__).parent / "points"
     save_path = Path(__file__).parent / "points_table.md"
 
-    df = load_data()
+    df = load_data(file_path)
     save_to_markdown(df, save_path)
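Note: the pattern in this file recurs throughout the PR, so the os-to-pathlib equivalences are worth spelling out once. A minimal self-contained sketch (the "data" directory name is hypothetical, not from this repo):

    from pathlib import Path

    # os.path.join(os.path.dirname(__file__), "data")  ->  Path(__file__).parent / "data"
    data_dir = Path(__file__).parent / "data"

    # glob.glob(os.path.join(data_dir, "*.jsonl"))  ->  data_dir.glob("*.jsonl")
    for file in data_dir.glob("*.jsonl"):
        # open(file)  ->  file.open(); both yield the same text-mode file object
        with file.open() as f:
            for line in f:
                pass  # process one JSONL line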
docs/mmteb/validate_points.py (39 changes: 18 additions & 21 deletions)
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-import os
+from pathlib import Path
 from typing import Optional
 
 from jsonlines import Reader
@@ -44,32 +44,29 @@ def check_max_points(obj: JsonObject, commit_n: str):
 
 # Function to validate JSONL files in a folder
 def validate_jsonl_files(folder_path):
-    for filename in os.listdir(folder_path):
-        if filename.endswith(".jsonl"):
-            file_path = os.path.join(folder_path, filename)
-            commit_n = os.path.splitext(filename)[0]
-            with open(file_path, encoding="utf-8") as file:
-                try:
-                    # Read JSONL file
-                    reader = Reader(file)
-                except Exception:
-                    raise Exception("Error reading file:", file_path)
-                for line in reader:
-                    try:
-                        # Validate JSON object against schema
-                        x = JsonObject(**line)
-                        logging.debug(x)
-                        check_max_points(x, commit_n)
+    folder_path = Path(folder_path)
+    for file_path in folder_path.glob("*.jsonl"):
+        commit_n = file_path.stem
+        with file_path.open(encoding="utf-8") as file:
+            try:
+                # Read JSONL file
+                reader = Reader(file)
+            except Exception:
+                raise Exception("Error reading file:", file_path)
+            for line in reader:
+                try:
+                    # Validate JSON object against schema
+                    x = JsonObject(**line)
+                    logging.debug(x)
+                    check_max_points(x, commit_n)
 
-                    except ValidationError as e:
-                        raise Exception(
-                            "Validation Error in file:", file_path, line
-                        ) from e
+                except ValidationError as e:
+                    raise Exception("Validation Error in file:", file_path, line) from e
 
 
 # Main function
 def main():
-    folder_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "points")
+    folder_path = Path(__file__).parent / "points"
     validate_jsonl_files(folder_path)
 
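Note: the rewrite above folds the three-step os idiom (os.listdir, an endswith filter, os.path.splitext for the stem) into two pathlib calls. A self-contained sketch, assuming a hypothetical "points" folder:

    from pathlib import Path

    folder_path = Path("points")  # hypothetical folder name

    # Before: os.listdir + filename.endswith(".jsonl") + os.path.splitext(filename)[0]
    # After: glob already filters by suffix, and .stem drops it
    for file_path in folder_path.glob("*.jsonl"):
        commit_n = file_path.stem  # "1234.jsonl" -> "1234"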
mteb/MTEB.py (4 changes: 2 additions & 2 deletions)
@@ -73,7 +73,7 @@ def __init__(
         self.benchmarks = tasks
         self.tasks = list(chain.from_iterable(self.tasks))
 
-        self.err_logs_path = err_logs_path
+        self.err_logs_path = Path(err_logs_path)
         self.last_evaluated_splits = {}
 
     @property
@@ -541,7 +541,7 @@ def run(
                     logger.error(
                         f"Please check all the error logs at: {self.err_logs_path}"
                     )
-                    with open(self.err_logs_path, "a") as f_out:
+                    with self.err_logs_path.open("a") as f_out:
                         f_out.write(f"{datetime.now()} >>> {task.metadata.name}\n")
                         f_out.write(traceback.format_exc())
                         f_out.write("\n\n")
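Note: wrapping the constructor argument in Path() keeps the public API unchanged, since Path accepts both strings and existing Path objects (the call is effectively idempotent). A quick illustration with a hypothetical log path:

    from pathlib import Path

    # str input and Path input normalize to the same object
    assert Path("error_logs.txt") == Path(Path("error_logs.txt"))
    # so str-based callers keep working, while .open("a") is available internally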
mteb/_evaluators/_download.py (4 changes: 3 additions & 1 deletion)
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from pathlib import Path
+
 import requests
 import tqdm
 
@@ -9,7 +11,7 @@ def download(url: str, fname: str):
     resp = requests.get(url, stream=True)
     total = int(resp.headers.get("content-length", 0))
     with (
-        open(fname, "wb") as file,
+        Path(fname).open("wb") as file,
         tqdm.tqdm(
             desc=fname,
             total=total,
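Note: the rest of download() is collapsed in this view; presumably it streams response chunks into the file while updating the progress bar. A standalone sketch of that standard pattern, under that assumption (not the repo's exact code):

    from pathlib import Path

    import requests
    import tqdm

    def download(url: str, fname: str) -> None:
        resp = requests.get(url, stream=True)
        total = int(resp.headers.get("content-length", 0))  # 0 if header absent
        with (
            Path(fname).open("wb") as file,
            tqdm.tqdm(desc=fname, total=total, unit="iB", unit_scale=True) as bar,
        ):
            for chunk in resp.iter_content(chunk_size=1024):
                bar.update(file.write(chunk))  # write() returns the byte count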
mteb/models/model_implementations/cache_wrapper.py (8 changes: 4 additions & 4 deletions)
@@ -114,15 +114,15 @@ def _double_vectors_file(self) -> None:
         self.vectors = new_vectors
 
     def _save_dimension(self) -> None:
-        with open(self.dimension_file, "w") as f:
+        with self.dimension_file.open("w") as f:
             f.write(str(self.vector_dim))
         logger.info(
             f"Saved vector dimension {self.vector_dim} to {self.dimension_file}"
         )
 
     def _load_dimension(self) -> None:
         if self.dimension_file.exists():
-            with open(self.dimension_file) as f:
+            with self.dimension_file.open() as f:
                 self.vector_dim = int(f.read().strip())
             logger.info(
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
@@ -144,7 +144,7 @@ def save(self) -> None:
             for hash_, index in self.hash_to_index.items()
         }
 
-        with open(self.index_file, "w", encoding="utf-8") as f:
+        with self.index_file.open("w", encoding="utf-8") as f:
             json.dump(serializable_index, f, indent=2)
         self._save_dimension()
         logger.info(f"Saved VectorCacheMap to {self.directory}")
@@ -156,7 +156,7 @@ def load(self, name: str | None = None) -> None:
         try:
             self._load_dimension()
             if self.index_file.exists() and self.vectors_file.exists():
-                with open(self.index_file, encoding="utf-8") as f:
+                with self.index_file.open(encoding="utf-8") as f:
                     loaded_index = json.load(f)
                 self.hash_to_index = {
                     str(hash_): int(index)  # Ensure we maintain the correct types
mteb/models/model_implementations/evaclip_models.py (4 changes: 2 additions & 2 deletions)
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any
 
 import torch
@@ -15,10 +16,9 @@
 
 def evaclip_loader(model_name, **kwargs):
     try:
-        import os
         import sys
 
-        sys.path.insert(0, os.path.join(os.getcwd(), "EVA/EVA-CLIP/rei"))
+        sys.path.insert(0, str(Path.cwd() / "EVA" / "EVA-CLIP" / "rei"))
 
         from eva_clip import create_model_and_transforms, get_tokenizer
     except ImportError:
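Note: the str() call around the Path here is required, not cosmetic: the default import machinery expects sys.path entries to be strings. Minimal illustration with a hypothetical vendor directory:

    import sys
    from pathlib import Path

    vendor_dir = Path.cwd() / "vendor"  # hypothetical directory
    sys.path.insert(0, str(vendor_dir))  # sys.path holds str, not Path objects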
mteb/tasks/Image/ZeroShotClassification/eng/Country211.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -47,8 +47,8 @@ class Country211ZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Country211_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Country211_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a photo showing the country of {c}." for c in labels]
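Note: this file and the three sibling label-file tasks below get the identical treatment. One pre-existing quirk the diff leaves untouched: readlines() keeps trailing newlines, so each label carries its "\n" into the template string. If that ever needs cleaning up, pathlib offers a one-liner that strips line endings; a hedged sketch:

    from pathlib import Path

    path = Path(__file__).parent / "templates" / "Country211_labels.txt"

    # f.readlines() -> ["Afghanistan\n", ...]; splitlines() drops the endings
    labels = path.read_text().splitlines()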
mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -52,8 +52,8 @@ class GTSRBZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/GTSRB_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "GTSRB_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a close up photo of a '{c}' traffic sign." for c in labels]
mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -48,8 +48,8 @@ class Imagenet1kZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Imagenet1k_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Imagenet1k_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a photo of {c}." for c in labels]
mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -60,8 +60,8 @@ class PatchCamelyonZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/PatchCamelyon_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "PatchCamelyon_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"histopathology image of {c}" for c in labels]
mteb/tasks/Retrieval/eng/MLQuestions.py (10 changes: 6 additions & 4 deletions)
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import csv
+from pathlib import Path
 
 from huggingface_hub import snapshot_download
 
@@ -80,8 +81,9 @@ def load_data(self) -> None:
     def _load_data_for_split(self, download_dir, split):
         queries, corpus, qrels = {}, {}, {}
 
-        dataset_path = f"{download_dir}/{split}.csv"
-        with open(dataset_path) as csvfile:
+        download_dir = Path(download_dir)
+        dataset_path = download_dir / f"{split}.csv"
+        with dataset_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 query_id = f"Q{str(i)}"
@@ -91,8 +93,8 @@ def _load_data_for_split(self, download_dir, split):
             qrels[query_id] = {f"C{doc_id}": 1}
 
         # Same corpus for all splits
-        corpus_path = f"{download_dir}/test_passages.csv"
-        with open(corpus_path) as csvfile:
+        corpus_path = download_dir / "test_passages.csv"
+        with corpus_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 doc_id = f"C{str(i)}"
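Note: a hedged aside on the CSV handling (pre-existing, unchanged here): the csv module documentation recommends opening files with newline="" so embedded newlines inside quoted fields round-trip correctly, and Path.open forwards that keyword just like the open builtin. Sketch with a hypothetical file:

    import csv
    from pathlib import Path

    dataset_path = Path("train.csv")  # hypothetical file
    with dataset_path.open(newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            pass  # each row is a dict keyed by the CSV header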
pyproject.toml (3 changes: 2 additions & 1 deletion)
@@ -167,7 +167,6 @@ namespaces = false
 "mteb.tasks.Image.ZeroShotClassification.eng.templates" = ["*.txt"]
 
 [tool.ruff]
-
 target-version = "py39"
 
 
@@ -222,6 +221,7 @@ select = [
     "RUF100", # unused-noqa
     "RUF101", # redirected-noqa
     "RUF200", # invalid-pyproject-toml
+    "PTH", # use pathlib
 ]
 
 ignore = [
@@ -241,6 +241,7 @@ ignore = [
 ]
 
 [tool.ruff.lint.per-file-ignores]
+"scripts/*" = ["PTH"]
 "tests/*" = ["RUF012"]
 
 [tool.ruff.lint.flake8-implicit-str-concat]
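Note: "PTH" selects Ruff's flake8-use-pathlib rule set, which flags os/os.path calls and the open() builtin where pathlib equivalents exist; the new per-file ignore exempts scripts/. Roughly the before/after shape these rules push toward (illustrative only):

    # flagged by PTH rules:
    #   path = os.path.join(os.getcwd(), "data", "file.txt")
    #   exists = os.path.exists(path)

    # suggested pathlib form:
    from pathlib import Path

    path = Path.cwd() / "data" / "file.txt"
    exists = path.exists()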
tests/test_models/model_loading.py (5 changes: 3 additions & 2 deletions)
@@ -94,8 +94,9 @@ def parse_args():
         all_model_names = args.model_name
     elif args.model_name_file:
         all_model_names = []
-        if Path(args.model_name_file).exists():
-            with open(args.model_name_file) as f:
+        model_name_file = Path(args.model_name_file)
+        if model_name_file.exists():
+            with model_name_file.open() as f:
                 all_model_names = f.read().strip().split()
         else:
             logging.warning(
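Note: for what it's worth, the same API would allow collapsing the open/read block into a single call; equivalent behavior, sketched with a hypothetical file name (not part of this diff):

    from pathlib import Path

    model_name_file = Path("models.txt")  # hypothetical file
    if model_name_file.exists():
        # read + strip + split in one pass
        all_model_names = model_name_file.read_text().strip().split()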