Source code for pycurator.collectors.term_collectors.dryad

"""
Module for collecting data from Dryad repository.
"""

from collections.abc import Collection
from typing import Any, Optional, Union

import pandas as pd

from ..base import (
    BaseCollector,
    BaseTermCollector,
)
from ..utils.validating import validate_metadata_parameters
from pycurator._typing import SearchTerm, TermResultDict


class DryadCollector(BaseTermCollector):
    """DataDryad collector for search term queries.

    Parameters
    ----------
    search_terms : list-like, optional
        Terms to search over. Can be (re)set via set_search_terms() or
        passed in directly to search functions.
    credentials : str, optional (default=None)
        JSON filepath containing credentials in form
        {repository_name}: {key}.
    """

    accepts_credentials: bool = True
    base_url: str = "https://datadryad.org/api/v2"
    merge_on: str = "version"

    def __init__(
        self,
        search_terms: Optional[Collection[SearchTerm]] = None,
        credentials: Optional[str] = None,
    ) -> None:
        super().__init__("dryad", search_terms=search_terms, credentials=credentials)

    @BaseCollector.track_indeterminate_progress
    def _conduct_search_over_pages(
        self,
        search_url: str,
        search_params: Any,
        print_progress: bool = False,
        delim: Optional[str] = None,
    ) -> pd.DataFrame:
        """Query records from the Dryad API for given parameters.

        Pages are requested one after another, starting at
        ``search_params["page"]``, until a response carries a falsy
        ``count``.

        Parameters
        ----------
        search_url : str
            Endpoint to query.
        search_params : dict
            Contains parameters to pass to requests.get({params}). Most
            common include search term 'q', and page index 'page'. Must
            contain 'page' (and 'q' when print_progress is True). The
            'page' entry is incremented IN PLACE, so after the call it
            holds the index of the first page that returned no results.
            For full details, see the Notes.
        print_progress : bool, optional (default=False)
            If True, updates on query page progress are sent to the object
            queue to be displayed in the UI window.
        delim : str, optional (default=None)
            Key to grab results from the '_embedded' section of the
            response JSON. If None, the whole '_embedded' section is
            considered as the data results.

        Returns
        -------
        search_df : pandas.DataFrame
            One row per returned record plus a 'page' column holding the
            page index the record came from. Empty if the first page
            returned no results.

        Notes
        -----
        Dryad allows the following parameters when querying the noted
        record type.

        Datasets:
            page : int, optional
                Page to search over.
            per_page : int, optional
                Number of results per page.
            q : str, optional
                Term to query for.
            affiliation : str, optional
                ROR identifier to require in a dataset's authors.
            tenant : str, optional
                Tenant organization in Dryad. Ignored if affiliation given.
            modifiedSince : str, optional
                An ISO 8601 UTC timestamp for limiting results.

        Files of a dataset version:
            id : int
                Version ID of the dataset.
            page : int, optional
                As above.
            per_page : int, optional
                As above.

        When searching for file metadata, only id, as described above,
        is allowed.
        """
        # Collect each page and concatenate once at the end; growing a
        # DataFrame with concat per page copies all prior rows every time.
        page_frames = []

        if print_progress:
            self._update_query_ref(
                search_term=search_params["q"], page=search_params["page"]
            )

        _, output = self.get_request_output(url=search_url, params=search_params)

        while output.get("count"):
            output = output["_embedded"]
            if delim:
                output = output[delim]

            output_df = pd.DataFrame(output)
            output_df["page"] = search_params["page"]
            page_frames.append(output_df)

            search_params["page"] += 1

            if print_progress:
                self._update_query_ref(
                    search_term=search_params["q"], page=search_params["page"]
                )

            _, output = self.get_request_output(url=search_url, params=search_params)

        if not page_frames:
            # pd.concat refuses an empty list; mirror the no-result case.
            return pd.DataFrame()

        return pd.concat(page_frames, ignore_index=True)

    @BaseTermCollector.validate_search_term
    def get_individual_search_output(self, search_term: SearchTerm) -> pd.DataFrame:
        """Return the datasets DataDryad reports for a search term.

        Parameters
        ----------
        search_term : str

        Returns
        -------
        search_df : pandas.DataFrame
            Search results with an added 'version' column holding the
            version id of each dataset (used for metadata querying).

        Raises
        ------
        TypeError
            Incorrect search_term type.
        """
        search_url = f"{self.base_url}/search"
        search_params = {"q": search_term, "page": 1, "per_page": 100}

        search_df = self._conduct_search_over_pages(
            search_url=search_url,
            search_params=search_params,
            print_progress=True,
            delim="stash:datasets",
        )

        # Add dataset-specific version id for metadata querying
        search_df["version"] = self._extract_version_ids(search_df)

        return search_df

    def get_query_metadata(
        self, object_paths: Union[str, Collection[str]], **kwargs: Any
    ) -> pd.DataFrame:
        """Retrieves the file metadata for the object_paths objects.

        Parameters
        ----------
        object_paths : str or collection of str
            Dataset version ids; each is queried at
            '{base_url}/versions/{object_path}/files'.
        **kwargs
            Accepted but not used by this method.

        Returns
        -------
        metadata_df : pandas.DataFrame
            File records of all versions, with 'version' and 'page'
            columns added.

        Raises
        ------
        TypeError
            If no object paths are provided.
        """
        object_paths = validate_metadata_parameters(object_paths)

        start_page = 1
        version_frames = []

        for object_path in self.track_determinate_progress(object_paths):
            search_url = f"{self.base_url}/versions/{object_path}/files"
            search_params = {"page": start_page}

            object_df = self._conduct_search_over_pages(
                search_url=search_url,
                search_params=search_params,
                delim="stash:files",
                print_progress=False,
            )

            object_df["version"] = object_path
            # NOTE(review): _conduct_search_over_pages increments
            # search_params["page"] in place, so this stamps every row with
            # the index of the first page that returned no results, not the
            # page the row came from. Behavior kept as-is; confirm intent.
            object_df.loc[:, "page"] = search_params["page"]
            version_frames.append(object_df)

        if not version_frames:
            return pd.DataFrame()

        return pd.concat(version_frames, ignore_index=True)

    @staticmethod
    def _extract_version_ids(version_df: pd.DataFrame) -> pd.Series:
        """Retrieve version ids from the '_links' entries of a DataFrame.

        The id is the last path segment of the 'href' stored under the
        'stash:version' key of each '_links' dictionary. Missing (NaN)
        entries are passed through unchanged. If the frame has no '_links'
        column (e.g. a search that returned nothing), an empty/NaN Series
        aligned to the frame's index is returned.
        """
        if "_links" not in version_df.columns:
            return pd.Series(index=version_df.index, dtype=object)

        def _version_id(links: dict) -> str:
            return links.get("stash:version", {}).get("href", "").split("/")[-1]

        # na_action="ignore" skips NaN entries instead of handing them to
        # _version_id, where a float would have no .get().
        return version_df["_links"].map(_version_id, na_action="ignore")

    def get_all_metadata(self, search_dict: TermResultDict) -> TermResultDict:
        """Retrieves metadata for records contained in input DataFrames.

        Parameters
        ----------
        search_dict : dict
            Dictionary of DataFrames from get_all_search_outputs. Each
            DataFrame must provide a 'version' column.

        Returns
        -------
        metadata_dict : dict
        """
        object_path_dict = {
            query: query_df["version"] for query, query_df in search_dict.items()
        }

        return self._get_metadata_from_paths(object_path_dict)