Skip to content

Commit 970a0df

Browse files
authored
Custom metadata filenames (#7663)
custom metadata filenames
1 parent e71b0b1 commit 970a0df

File tree

1 file changed: +8 −5 lines changed

src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class FolderBasedBuilderConfig(datasets.BuilderConfig):
     features: Optional[datasets.Features] = None
     drop_labels: bool = None
     drop_metadata: bool = None
+    metadata_filenames: list[str] = None
     filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None

     def __post_init__(self):
@@ -76,6 +77,7 @@ def _split_generators(self, dl_manager):
         do_analyze = not self.config.drop_labels or not self.config.drop_metadata
         labels, path_depths = set(), set()
         metadata_files = collections.defaultdict(set)
+        metadata_filenames = self.config.metadata_filenames or self.METADATA_FILENAMES

         def analyze(files_or_archives, downloaded_files_or_dirs, split):
             if len(downloaded_files_or_dirs) == 0:
@@ -91,12 +93,12 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
                         if not self.config.drop_labels:
                             labels.add(os.path.basename(os.path.dirname(original_file)))
                             path_depths.add(count_path_segments(original_file))
-                    elif os.path.basename(original_file) in self.METADATA_FILENAMES:
+                    elif os.path.basename(original_file) in metadata_filenames:
                         metadata_files[split].add((original_file, downloaded_file))
                     else:
                         original_file_name = os.path.basename(original_file)
                         logger.debug(
-                            f"The file '{original_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAMES} either."
+                            f"The file '{original_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {metadata_filenames} either."
                         )
             else:
                 archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
@@ -108,13 +110,13 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
                             if not self.config.drop_labels:
                                 labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))
                                 path_depths.add(count_path_segments(downloaded_dir_file))
-                        elif os.path.basename(downloaded_dir_file) in self.METADATA_FILENAMES:
+                        elif os.path.basename(downloaded_dir_file) in metadata_filenames:
                             metadata_files[split].add((None, downloaded_dir_file))
                         else:
                             archive_file_name = os.path.basename(archive)
                             original_file_name = os.path.basename(downloaded_dir_file)
                             logger.debug(
-                                f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAMES} either."
+                                f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {metadata_filenames} either."
                             )

         data_files = self.config.data_files
@@ -257,11 +259,12 @@ def _set_feature(feature):

     def _split_files_and_archives(self, data_files):
         files, archives = [], []
+        metadata_filenames = self.config.metadata_filenames or self.METADATA_FILENAMES
         for data_file in data_files:
             _, data_file_ext = os.path.splitext(data_file)
             if data_file_ext.lower() in self.EXTENSIONS:
                 files.append(data_file)
-            elif os.path.basename(data_file) in self.METADATA_FILENAMES:
+            elif os.path.basename(data_file) in metadata_filenames:
                 files.append(data_file)
             else:
                 archives.append(data_file)

0 commit comments

Comments
 (0)