"""
Module for collecting data from the Zenodo repository.
"""
import warnings
from collections.abc import Collection
from typing import Any, Optional, Iterable, NoReturn
import pandas as pd
from ..base import BaseCollector, BaseTermCollector
from pycurator._typing import SearchTerm
class ZenodoCollector(BaseTermCollector):
    """Zenodo collector for search term queries.

    Records are queried from the Zenodo records endpoint one calendar year
    at a time, walking backwards from a fixed starting year until a year
    reports no hits.

    Parameters
    ----------
    search_terms : list-like, optional
        Terms to search over. Can be (re)set via set_search_terms()
        or passed in directly to search functions.
    credentials : str, optional (default=None)
        JSON filepath containing credentials in form
        {repository_name}: {key}.
    """

    base_url: str = "https://zenodo.org/api/records"
    accepts_credentials: bool = True

    def __init__(
        self,
        search_terms: Optional[Collection[SearchTerm]] = None,
        credentials: Optional[str] = None,
    ) -> None:
        super().__init__("zenodo", search_terms=search_terms, credentials=credentials)

    def _request_page(self, search_term: SearchTerm, search_year: int, page: int):
        """Request one page of records created during ``search_year``.

        Returns the ``(response, output)`` pair produced by
        ``get_request_output_and_update_query_ref``.
        """
        start_date = f"{search_year}-01-01"
        end_date = f"{search_year}-12-31"
        search_params = {
            "q": f"{search_term} AND created:[{start_date} TO {end_date}]",
            "page": page,
            "size": 1000,
        }

        return self.get_request_output_and_update_query_ref(
            url=self.base_url,
            params=search_params,
            search_term=search_term,
            year=search_year,
            page=page,
        )

    @staticmethod
    def _hits_section(output: Any) -> dict:
        """Return ``output["hits"]`` if it is a dict, otherwise an empty dict.

        Keeps the paging loops from raising ``AttributeError`` when a
        response payload does not contain the expected ``"hits"`` mapping.
        """
        if isinstance(output, dict):
            hits = output.get("hits")
            if isinstance(hits, dict):
                return hits
        return {}

    @BaseTermCollector.validate_search_term
    @BaseCollector.track_indeterminate_progress
    def get_individual_search_output(self, search_term: SearchTerm) -> pd.DataFrame:
        """Returns information about all records from Zenodo.

        Parameters
        ----------
        search_term : str

        Returns
        -------
        search_df : pandas.DataFrame
            One row per returned record, plus a ``page`` column holding the
            page number the record was returned on. Empty if nothing was
            collected.

        Raises
        ------
        TypeError
            Incorrect search_term type.

        Warns
        -----
        RuntimeWarning
            Unsuccessful query response from the API.
        """
        # NOTE(review): the starting year is hard-coded, so records created
        # after 2022 are never queried -- confirm whether this is intended.
        search_year = 2022
        page_frames = []

        response, output = self._request_page(search_term, search_year, page=1)

        # Handle any potential errors
        if response.status_code != 200:
            warnings.warn(
                f"{response.status_code}: Returning without results.", RuntimeWarning
            )
            self.continue_running = False
            self.terminate()
            # Discard the error payload so the paging loop below is skipped
            # and the method really does return without results.
            output = None

        hits = self._hits_section(output)

        # Walk backwards one year at a time until a year reports no hits.
        while hits.get("total"):
            page = 1

            # Page through the current year until a request fails or a page
            # comes back empty.
            while response.status_code == 200 and hits.get("hits"):
                page_df = pd.DataFrame(hits["hits"])
                page_df["page"] = page
                page_frames.append(page_df)

                page += 1
                response, output = self._request_page(search_term, search_year, page)
                hits = self._hits_section(output)

            search_year -= 1
            response, output = self._request_page(search_term, search_year, page=1)
            hits = self._hits_section(output)

        # NOTE(review): presumably resets progress tracking kept by the base
        # collector -- confirm against BaseCollector.
        self.num_queries = False

        # Concatenate once at the end; pd.concat raises on an empty list.
        if page_frames:
            search_df = pd.concat(page_frames, ignore_index=True)
        else:
            search_df = pd.DataFrame()

        return search_df

    def get_query_metadata(self, object_paths: Iterable[Any]) -> NoReturn:
        """Always raise, since metadata collection is not supported for Zenodo.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Zenodo does not provide object metadata.")