docs/mmteb/create_points_table.py (13 changes: 5 additions & 8 deletions)
@@ -6,18 +6,15 @@
 import pandas as pd
 
 
-def load_data() -> pd.DataFrame:
-    file_path = Path(__file__).parent / "points"
+def load_data(file_path: Path) -> pd.DataFrame:
     files = file_path.glob("*.jsonl")
 
     json_data = []
     for file in files:
-        with open(file) as f:
+        with file.open() as f:
             for line in f:
                 json_data.append(json.loads(line))
 
-    df = pd.DataFrame(json_data)
-    return df
+    return pd.DataFrame(json_data)
@@ -30,13 +27,13 @@ def save_to_markdown(df: pd.DataFrame, file_path: Path) -> None:
     md = df.to_markdown()
     # add title
     md = f"# Points\n\n_Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions.\n\n {md}"
-    with open(file_path, "w") as f:
+    with file_path.open("w") as f:
        f.write(md)
 
 
 if __name__ == "__main__":
+    file_path = Path(__file__).parent / "points"
     save_path = Path(__file__).parent / "points_table.md"
 
-    df = load_data()
+    df = load_data(file_path)
     save_to_markdown(df, save_path)
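Note: the pattern in this file recurs throughout the PR, so the os-to-pathlib equivalences are worth spelling out once. A minimal self-contained sketch (the "data" directory name is hypothetical, not from this repo):

    from pathlib import Path

    # os.path.join(os.path.dirname(__file__), "data")  ->  Path(__file__).parent / "data"
    data_dir = Path(__file__).parent / "data"

    # glob.glob(os.path.join(data_dir, "*.jsonl"))  ->  data_dir.glob("*.jsonl")
    for file in data_dir.glob("*.jsonl"):
        # open(file)  ->  file.open(); both yield the same text-mode file object
        with file.open() as f:
            for line in f:
                pass  # process one JSONL line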
docs/mmteb/validate_points.py (39 changes: 18 additions & 21 deletions)
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-import os
+from pathlib import Path
 from typing import Optional
 
 from jsonlines import Reader
@@ -44,32 +44,29 @@ def check_max_points(obj: JsonObject, commit_n: str):
 
 # Function to validate JSONL files in a folder
 def validate_jsonl_files(folder_path):
-    for filename in os.listdir(folder_path):
-        if filename.endswith(".jsonl"):
-            file_path = os.path.join(folder_path, filename)
-            commit_n = os.path.splitext(filename)[0]
-            with open(file_path, encoding="utf-8") as file:
-                try:
-                    # Read JSONL file
-                    reader = Reader(file)
-                except Exception:
-                    raise Exception("Error reading file:", file_path)
-                for line in reader:
-                    try:
-                        # Validate JSON object against schema
-                        x = JsonObject(**line)
-                        logging.debug(x)
-                        check_max_points(x, commit_n)
+    folder_path = Path(folder_path)
+    for file_path in folder_path.glob("*.jsonl"):
+        commit_n = file_path.stem
+        with file_path.open(encoding="utf-8") as file:
+            try:
+                # Read JSONL file
+                reader = Reader(file)
+            except Exception:
+                raise Exception("Error reading file:", file_path)
+            for line in reader:
+                try:
+                    # Validate JSON object against schema
+                    x = JsonObject(**line)
+                    logging.debug(x)
+                    check_max_points(x, commit_n)
 
-                    except ValidationError as e:
-                        raise Exception(
-                            "Validation Error in file:", file_path, line
-                        ) from e
+                except ValidationError as e:
+                    raise Exception("Validation Error in file:", file_path, line) from e
 
 
 # Main function
 def main():
-    folder_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "points")
+    folder_path = Path(__file__).parent / "points"
     validate_jsonl_files(folder_path)
 
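Note: the rewrite above folds the three-step os idiom (os.listdir, an endswith filter, os.path.splitext for the stem) into two pathlib calls. A self-contained sketch, assuming a hypothetical "points" folder:

    from pathlib import Path

    folder_path = Path("points")  # hypothetical folder name

    # Before: os.listdir + filename.endswith(".jsonl") + os.path.splitext(filename)[0]
    # After: glob already filters by suffix, and .stem drops it
    for file_path in folder_path.glob("*.jsonl"):
        commit_n = file_path.stem  # "1234.jsonl" -> "1234"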
mteb/MTEB.py (4 changes: 2 additions & 2 deletions)
@@ -73,7 +73,7 @@ def __init__(
         self.benchmarks = tasks
         self.tasks = list(chain.from_iterable(self.tasks))
 
-        self.err_logs_path = err_logs_path
+        self.err_logs_path = Path(err_logs_path)
         self.last_evaluated_splits = {}
 
     @property
@@ -541,7 +541,7 @@ def run(
                     logger.error(
                         f"Please check all the error logs at: {self.err_logs_path}"
                     )
-                    with open(self.err_logs_path, "a") as f_out:
+                    with self.err_logs_path.open("a") as f_out:
                         f_out.write(f"{datetime.now()} >>> {task.metadata.name}\n")
                         f_out.write(traceback.format_exc())
                         f_out.write("\n\n")
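Note: wrapping the constructor argument in Path() keeps the public API unchanged, since Path accepts both strings and existing Path objects (the call is effectively idempotent). A quick illustration with a hypothetical log path:

    from pathlib import Path

    # str input and Path input normalize to the same object
    assert Path("error_logs.txt") == Path(Path("error_logs.txt"))
    # so str-based callers keep working, while .open("a") is available internally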
mteb/_evaluators/_download.py (4 changes: 3 additions & 1 deletion)
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from pathlib import Path
+
 import requests
 import tqdm
 
@@ -9,7 +11,7 @@ def download(url: str, fname: str):
     resp = requests.get(url, stream=True)
     total = int(resp.headers.get("content-length", 0))
     with (
-        open(fname, "wb") as file,
+        Path(fname).open("wb") as file,
         tqdm.tqdm(
             desc=fname,
             total=total,
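Note: the rest of download() is collapsed in this view; presumably it streams response chunks into the file while updating the progress bar. A standalone sketch of that standard pattern, under that assumption (not the repo's exact code):

    from pathlib import Path

    import requests
    import tqdm

    def download(url: str, fname: str) -> None:
        resp = requests.get(url, stream=True)
        total = int(resp.headers.get("content-length", 0))  # 0 if header absent
        with (
            Path(fname).open("wb") as file,
            tqdm.tqdm(desc=fname, total=total, unit="iB", unit_scale=True) as bar,
        ):
            for chunk in resp.iter_content(chunk_size=1024):
                bar.update(file.write(chunk))  # write() returns the byte count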
mteb/models/model_implementations/cache_wrapper.py (8 changes: 4 additions & 4 deletions)
@@ -114,15 +114,15 @@ def _double_vectors_file(self) -> None:
         self.vectors = new_vectors
 
     def _save_dimension(self) -> None:
-        with open(self.dimension_file, "w") as f:
+        with self.dimension_file.open("w") as f:
             f.write(str(self.vector_dim))
         logger.info(
             f"Saved vector dimension {self.vector_dim} to {self.dimension_file}"
         )
 
     def _load_dimension(self) -> None:
         if self.dimension_file.exists():
-            with open(self.dimension_file) as f:
+            with self.dimension_file.open() as f:
                 self.vector_dim = int(f.read().strip())
             logger.info(
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
@@ -144,7 +144,7 @@ def save(self) -> None:
             for hash_, index in self.hash_to_index.items()
         }
 
-        with open(self.index_file, "w", encoding="utf-8") as f:
+        with self.index_file.open("w", encoding="utf-8") as f:
             json.dump(serializable_index, f, indent=2)
         self._save_dimension()
         logger.info(f"Saved VectorCacheMap to {self.directory}")
@@ -156,7 +156,7 @@ def load(self, name: str | None = None) -> None:
         try:
             self._load_dimension()
             if self.index_file.exists() and self.vectors_file.exists():
-                with open(self.index_file, encoding="utf-8") as f:
+                with self.index_file.open(encoding="utf-8") as f:
                     loaded_index = json.load(f)
                 self.hash_to_index = {
                     str(hash_): int(index)  # Ensure we maintain the correct types
mteb/models/model_implementations/evaclip_models.py (4 changes: 2 additions & 2 deletions)
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any
 
 import torch
@@ -15,10 +16,9 @@
 
 def evaclip_loader(model_name, **kwargs):
     try:
-        import os
         import sys
 
-        sys.path.insert(0, os.path.join(os.getcwd(), "EVA/EVA-CLIP/rei"))
+        sys.path.insert(0, str(Path.cwd() / "EVA" / "EVA-CLIP" / "rei"))
 
         from eva_clip import create_model_and_transforms, get_tokenizer
     except ImportError:
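Note: the str() call around the Path here is required, not cosmetic: the default import machinery expects sys.path entries to be strings. Minimal illustration with a hypothetical vendor directory:

    import sys
    from pathlib import Path

    vendor_dir = Path.cwd() / "vendor"  # hypothetical directory
    sys.path.insert(0, str(vendor_dir))  # sys.path holds str, not Path objects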
mteb/tasks/Image/ZeroShotClassification/eng/Country211.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -47,8 +47,8 @@ class Country211ZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Country211_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Country211_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a photo showing the country of {c}." for c in labels]
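Note: this file and the three sibling label-file tasks below get the identical treatment. One pre-existing quirk the diff leaves untouched: readlines() keeps trailing newlines, so each label carries its "\n" into the template string. If that ever needs cleaning up, pathlib offers a one-liner that strips line endings; a hedged sketch:

    from pathlib import Path

    path = Path(__file__).parent / "templates" / "Country211_labels.txt"

    # f.readlines() -> ["Afghanistan\n", ...]; splitlines() drops the endings
    labels = path.read_text().splitlines()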
mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -52,8 +52,8 @@ class GTSRBZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/GTSRB_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "GTSRB_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a close up photo of a '{c}' traffic sign." for c in labels]
mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -48,8 +48,8 @@ class Imagenet1kZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name: str = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/Imagenet1k_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "Imagenet1k_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"a photo of {c}." for c in labels]
mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+from pathlib import Path
 
 from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
     AbsTaskAnyZeroShotClassification,
@@ -60,8 +60,8 @@ class PatchCamelyonZeroShotClassification(AbsTaskAnyZeroShotClassification):
     label_column_name = "cls"
 
     def get_candidate_labels(self) -> list[str]:
-        path = os.path.dirname(__file__)
-        with open(os.path.join(path, "templates/PatchCamelyon_labels.txt")) as f:
+        path = Path(__file__).parent / "templates" / "PatchCamelyon_labels.txt"
+        with path.open() as f:
             labels = f.readlines()
 
         return [f"histopathology image of {c}" for c in labels]
mteb/tasks/Retrieval/eng/MLQuestions.py (10 changes: 6 additions & 4 deletions)
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import csv
+from pathlib import Path
 
 from huggingface_hub import snapshot_download
 
@@ -80,8 +81,9 @@ def load_data(self) -> None:
     def _load_data_for_split(self, download_dir, split):
         queries, corpus, qrels = {}, {}, {}
 
-        dataset_path = f"{download_dir}/{split}.csv"
-        with open(dataset_path) as csvfile:
+        download_dir = Path(download_dir)
+        dataset_path = download_dir / f"{split}.csv"
+        with dataset_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 query_id = f"Q{str(i)}"
@@ -91,8 +93,8 @@ def _load_data_for_split(self, download_dir, split):
             qrels[query_id] = {f"C{doc_id}": 1}
 
         # Same corpus for all splits
-        corpus_path = f"{download_dir}/test_passages.csv"
-        with open(corpus_path) as csvfile:
+        corpus_path = download_dir / "test_passages.csv"
+        with corpus_path.open() as csvfile:
             reader = csv.DictReader(csvfile)
             for i, row in enumerate(reader):
                 doc_id = f"C{str(i)}"
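Note: a hedged aside on the CSV handling (pre-existing, unchanged here): the csv module documentation recommends opening files with newline="" so embedded newlines inside quoted fields round-trip correctly, and Path.open forwards that keyword just like the open builtin. Sketch with a hypothetical file:

    import csv
    from pathlib import Path

    dataset_path = Path("train.csv")  # hypothetical file
    with dataset_path.open(newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            pass  # each row is a dict keyed by the CSV header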
pyproject.toml (3 changes: 2 additions & 1 deletion)
@@ -167,7 +167,6 @@ namespaces = false
 "mteb.tasks.Image.ZeroShotClassification.eng.templates" = ["*.txt"]
 
 [tool.ruff]
-
 target-version = "py39"
 
 
@@ -222,6 +221,7 @@ select = [
     "RUF100", # unused-noqa
     "RUF101", # redirected-noqa
     "RUF200", # invalid-pyproject-toml
+    "PTH", # use pathlib
 ]
 
 ignore = [
@@ -241,6 +241,7 @@ ignore = [
 ]
 
 [tool.ruff.lint.per-file-ignores]
+"scripts/*" = ["PTH"]
 "tests/*" = ["RUF012"]
 
 [tool.ruff.lint.flake8-implicit-str-concat]
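Note: "PTH" selects Ruff's flake8-use-pathlib rule set, which flags os/os.path calls and the open() builtin where pathlib equivalents exist; the new per-file ignore exempts scripts/. Roughly the before/after shape these rules push toward (illustrative only):

    # flagged by PTH rules:
    #   path = os.path.join(os.getcwd(), "data", "file.txt")
    #   exists = os.path.exists(path)

    # suggested pathlib form:
    from pathlib import Path

    path = Path.cwd() / "data" / "file.txt"
    exists = path.exists()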
tests/test_models/model_loading.py (5 changes: 3 additions & 2 deletions)
@@ -94,8 +94,9 @@ def parse_args():
         all_model_names = args.model_name
     elif args.model_name_file:
         all_model_names = []
-        if Path(args.model_name_file).exists():
-            with open(args.model_name_file) as f:
+        model_name_file = Path(args.model_name_file)
+        if model_name_file.exists():
+            with model_name_file.open() as f:
                 all_model_names = f.read().strip().split()
         else:
             logging.warning(
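Note: for what it's worth, the same API would allow collapsing the open/read block into a single call; equivalent behavior, sketched with a hypothetical file name (not part of this diff):

    from pathlib import Path

    model_name_file = Path("models.txt")  # hypothetical file
    if model_name_file.exists():
        # read + strip + split in one pass
        all_model_names = model_name_file.read_text().strip().split()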