Skip to content

Commit 1ef14b3

Browse files
authored
py: Add more authorship metadata from model card (#8810)
* py: add more authorship metadata from model card * fixup! py: add more authorship metadata from model card
1 parent d3f0c71 commit 1ef14b3

File tree

1 file changed

+68
-61
lines changed

1 file changed

+68
-61
lines changed

gguf-py/gguf/metadata.py

Lines changed: 68 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -284,20 +284,67 @@ def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = No
284284
########################
285285
if model_card is not None:
286286

287-
if "model_name" in model_card and metadata.name is None:
288-
# Not part of huggingface model card standard but notice some model creator using it
289-
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
290-
metadata.name = model_card.get("model_name")
287+
def use_model_card_metadata(metadata_key: str, model_card_key: str):
288+
if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
289+
setattr(metadata, metadata_key, model_card.get(model_card_key))
291290

292-
if "model_creator" in model_card and metadata.author is None:
293-
# Not part of huggingface model card standard but notice some model creator using it
294-
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
295-
metadata.author = model_card.get("model_creator")
291+
def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
292+
# Note: Appends to the existing list rather than replacing it if a value already exists
293+
tags_value = model_card.get(model_card_key, None)
294+
if tags_value is None:
295+
return
296296

297-
if "model_type" in model_card and metadata.basename is None:
298-
# Not part of huggingface model card standard but notice some model creator using it
299-
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
300-
metadata.basename = model_card.get("model_type")
297+
current_value = getattr(metadata, metadata_key, None)
298+
if current_value is None:
299+
current_value = []
300+
301+
if isinstance(tags_value, str):
302+
current_value.append(tags_value)
303+
elif isinstance(tags_value, list):
304+
current_value.extend(tags_value)
305+
306+
setattr(metadata, metadata_key, current_value)
307+
308+
# LLAMA.cpp's direct internal convention
309+
# (Definitely not part of the Hugging Face formal/informal standard)
310+
#########################################
311+
use_model_card_metadata("name", "name")
312+
use_model_card_metadata("author", "author")
313+
use_model_card_metadata("version", "version")
314+
use_model_card_metadata("organization", "organization")
315+
use_model_card_metadata("description", "description")
316+
use_model_card_metadata("finetune", "finetune")
317+
use_model_card_metadata("basename", "basename")
318+
use_model_card_metadata("size_label", "size_label")
319+
use_model_card_metadata("source_url", "url")
320+
use_model_card_metadata("source_doi", "doi")
321+
use_model_card_metadata("source_uuid", "uuid")
322+
use_model_card_metadata("source_repo_url", "repo_url")
323+
324+
# LLAMA.cpp's huggingface style convention
325+
# (Definitely not part of the Hugging Face formal/informal standard... but with a model_ prefix to match their style)
326+
###########################################
327+
use_model_card_metadata("name", "model_name")
328+
use_model_card_metadata("author", "model_author")
329+
use_model_card_metadata("version", "model_version")
330+
use_model_card_metadata("organization", "model_organization")
331+
use_model_card_metadata("description", "model_description")
332+
use_model_card_metadata("finetune", "model_finetune")
333+
use_model_card_metadata("basename", "model_basename")
334+
use_model_card_metadata("size_label", "model_size_label")
335+
use_model_card_metadata("source_url", "model_url")
336+
use_model_card_metadata("source_doi", "model_doi")
337+
use_model_card_metadata("source_uuid", "model_uuid")
338+
use_model_card_metadata("source_repo_url", "model_repo_url")
339+
340+
# Hugging Face Direct Convention
341+
#################################
342+
343+
# Not part of the Hugging Face model card standard, but some model creators use it,
344+
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
345+
use_model_card_metadata("name", "model_name")
346+
use_model_card_metadata("author", "model_creator")
347+
use_model_card_metadata("basename", "model_type")
301348

302349
if "base_model" in model_card:
303350
# This represents the parent models that this is based on
@@ -329,58 +376,18 @@ def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = No
329376
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
330377
metadata.base_models.append(base_model)
331378

332-
if "license" in model_card and metadata.license is None:
333-
metadata.license = model_card.get("license")
334-
335-
if "license_name" in model_card and metadata.license_name is None:
336-
metadata.license_name = model_card.get("license_name")
337-
338-
if "license_link" in model_card and metadata.license_link is None:
339-
metadata.license_link = model_card.get("license_link")
340-
341-
tags_value = model_card.get("tags", None)
342-
if tags_value is not None:
343-
344-
if metadata.tags is None:
345-
metadata.tags = []
346-
347-
if isinstance(tags_value, str):
348-
metadata.tags.append(tags_value)
349-
elif isinstance(tags_value, list):
350-
metadata.tags.extend(tags_value)
351-
352-
pipeline_tags_value = model_card.get("pipeline_tag", None)
353-
if pipeline_tags_value is not None:
354-
355-
if metadata.tags is None:
356-
metadata.tags = []
357-
358-
if isinstance(pipeline_tags_value, str):
359-
metadata.tags.append(pipeline_tags_value)
360-
elif isinstance(pipeline_tags_value, list):
361-
metadata.tags.extend(pipeline_tags_value)
362-
363-
language_value = model_card.get("languages", model_card.get("language", None))
364-
if language_value is not None:
365-
366-
if metadata.languages is None:
367-
metadata.languages = []
368-
369-
if isinstance(language_value, str):
370-
metadata.languages.append(language_value)
371-
elif isinstance(language_value, list):
372-
metadata.languages.extend(language_value)
379+
use_model_card_metadata("license", "license")
380+
use_model_card_metadata("license_name", "license_name")
381+
use_model_card_metadata("license_link", "license_link")
373382

374-
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
375-
if dataset_value is not None:
383+
use_array_model_card_metadata("tags", "tags")
384+
use_array_model_card_metadata("tags", "pipeline_tag")
376385

377-
if metadata.datasets is None:
378-
metadata.datasets = []
386+
use_array_model_card_metadata("languages", "languages")
387+
use_array_model_card_metadata("languages", "language")
379388

380-
if isinstance(dataset_value, str):
381-
metadata.datasets.append(dataset_value)
382-
elif isinstance(dataset_value, list):
383-
metadata.datasets.extend(dataset_value)
389+
use_array_model_card_metadata("datasets", "datasets")
390+
use_array_model_card_metadata("datasets", "dataset")
384391

385392
# Hugging Face Parameter Heuristics
386393
####################################

0 commit comments

Comments
 (0)