@@ -31,6 +31,7 @@ class FolderBasedBuilderConfig(datasets.BuilderConfig):
31
31
features : Optional [datasets .Features ] = None
32
32
drop_labels : bool = None
33
33
drop_metadata : bool = None
34
+ metadata_filenames : list [str ] = None
34
35
filters : Optional [Union [ds .Expression , list [tuple ], list [list [tuple ]]]] = None
35
36
36
37
def __post_init__ (self ):
@@ -76,6 +77,7 @@ def _split_generators(self, dl_manager):
76
77
do_analyze = not self .config .drop_labels or not self .config .drop_metadata
77
78
labels , path_depths = set (), set ()
78
79
metadata_files = collections .defaultdict (set )
80
+ metadata_filenames = self .config .metadata_filenames or self .METADATA_FILENAMES
79
81
80
82
def analyze (files_or_archives , downloaded_files_or_dirs , split ):
81
83
if len (downloaded_files_or_dirs ) == 0 :
@@ -91,12 +93,12 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
91
93
if not self .config .drop_labels :
92
94
labels .add (os .path .basename (os .path .dirname (original_file )))
93
95
path_depths .add (count_path_segments (original_file ))
94
- elif os .path .basename (original_file ) in self . METADATA_FILENAMES :
96
+ elif os .path .basename (original_file ) in metadata_filenames :
95
97
metadata_files [split ].add ((original_file , downloaded_file ))
96
98
else :
97
99
original_file_name = os .path .basename (original_file )
98
100
logger .debug (
99
- f"The file '{ original_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { self . METADATA_FILENAMES } either."
101
+ f"The file '{ original_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { metadata_filenames } either."
100
102
)
101
103
else :
102
104
archives , downloaded_dirs = files_or_archives , downloaded_files_or_dirs
@@ -108,13 +110,13 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
108
110
if not self .config .drop_labels :
109
111
labels .add (os .path .basename (os .path .dirname (downloaded_dir_file )))
110
112
path_depths .add (count_path_segments (downloaded_dir_file ))
111
- elif os .path .basename (downloaded_dir_file ) in self . METADATA_FILENAMES :
113
+ elif os .path .basename (downloaded_dir_file ) in metadata_filenames :
112
114
metadata_files [split ].add ((None , downloaded_dir_file ))
113
115
else :
114
116
archive_file_name = os .path .basename (archive )
115
117
original_file_name = os .path .basename (downloaded_dir_file )
116
118
logger .debug (
117
- f"The file '{ original_file_name } ' from the archive '{ archive_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { self . METADATA_FILENAMES } either."
119
+ f"The file '{ original_file_name } ' from the archive '{ archive_file_name } ' was ignored: it is not a { self .BASE_COLUMN_NAME } , and is not { metadata_filenames } either."
118
120
)
119
121
120
122
data_files = self .config .data_files
@@ -257,11 +259,12 @@ def _set_feature(feature):
257
259
258
260
def _split_files_and_archives (self , data_files ):
259
261
files , archives = [], []
262
+ metadata_filenames = self .config .metadata_filenames or self .METADATA_FILENAMES
260
263
for data_file in data_files :
261
264
_ , data_file_ext = os .path .splitext (data_file )
262
265
if data_file_ext .lower () in self .EXTENSIONS :
263
266
files .append (data_file )
264
- elif os .path .basename (data_file ) in self . METADATA_FILENAMES :
267
+ elif os .path .basename (data_file ) in metadata_filenames :
265
268
files .append (data_file )
266
269
else :
267
270
archives .append (data_file )
0 commit comments