Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to allow bulk importing jats zip files. #89

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
18 changes: 18 additions & 0 deletions admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,29 @@ class CSVImportArticleAdmin(utils_admin_utils.ArticleFKModelAdmin):
date_hierarchy = ('imported')


class NotificationAdmin(admin.ModelAdmin):
list_display = ('pk', 'email')


class CitationFormatAdmin(admin.ModelAdmin):
list_display = ('journal', 'format')
raw_id_fields = ('journal',)


class SectionMapAdmin(admin.ModelAdmin):
list_display = ('section', 'article_type')
search_fields = ('article_type', 'section__name')
raw_id_fields = ('section',)


for pair in [
(models.ExportFile, ExportFileAdmin),
(models.CSVImport, CSVImportAdmin),
(models.CSVImportCreateArticle, CSVImportArticleAdmin),
(models.CSVImportUpdateArticle, CSVImportArticleAdmin),
(models.OJSFile,),
(models.AutomatedImportNotification, NotificationAdmin),
(models.CitationFormat, CitationFormatAdmin),
(models.SectionMap, SectionMapAdmin),
]:
admin.site.register(*pair)
111 changes: 88 additions & 23 deletions jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from django.db import transaction
from django.utils import timezone

from plugins.imports import models
from core import files
from core import models as core_models
from core.models import Account
Expand All @@ -42,6 +43,7 @@ def import_jats_article(
jats_contents, journal=None,
persist=True, filename=None, owner=None,
images=None, request=None, stage=None,
get_section_from_subject=False,
):
""" JATS import entrypoint
:param jats_contents: (str) the JATS XML to be imported
Expand All @@ -63,13 +65,14 @@ def import_jats_article(
meta["abstract"] = get_jats_abstract(metadata_soup)
meta["issue"], meta["volume"] = get_jats_issue(jats_soup)
meta["keywords"] = get_jats_keywords(metadata_soup)
meta["section_name"] = get_jats_section_name(jats_soup)
meta["section_name"] = get_jats_section_name(jats_soup, get_section_from_subject)
meta["date_published"] = get_jats_pub_date(jats_soup) or datetime.date.today()
meta["license_url"], meta["license_text"] = get_jats_license(jats_soup)
meta["rights"] = get_jats_rights_statement(jats_soup)
meta["authors"] = []
meta["date_submitted"] = None
meta["date_accepted"] = None
meta["custom_how_to_cite"] = get_custom_how_to_cite(metadata_soup)
try:
meta["first_page"] = int(metadata_soup.find("fpage").text)
except (ValueError, AttributeError):
Expand All @@ -85,7 +88,7 @@ def import_jats_article(
meta["date_accepted"] = get_jats_acc_date(history_soup)

authors_soup = metadata_soup.find("contrib-group")
author_notes = metadata_soup.find("author_notes")
author_notes = metadata_soup.find("author-notes")
if authors_soup:
meta["authors"] = get_jats_authors(
authors_soup,
Expand Down Expand Up @@ -115,7 +118,10 @@ def import_jats_article(
return article


def import_jats_zipped(zip_file, journal=None, owner=None, persist=True, stage=None):
def import_jats_zipped(
zip_file, journal=None, owner=None,
persist=True, stage=None, get_section_from_subject=False,
):
""" Import a batch of Zipped JATS articles and their associated files
:param zip_file: The zipped jats to be imported
:param journal: Journal in which to import the articles
Expand Down Expand Up @@ -147,7 +153,6 @@ def import_jats_zipped(zip_file, journal=None, owner=None, persist=True, stage=N
else:
supplements.append(file_path)


if jats_path:
# Check nested dirs relative to xml like ./figures
for dir_ in dirs:
Expand All @@ -163,14 +168,15 @@ def import_jats_zipped(zip_file, journal=None, owner=None, persist=True, stage=N
jats_file.read(), journal, persist,
jats_filename, owner, supplements,
stage=stage,
get_section_from_subject=get_section_from_subject,
)
articles.append((jats_filename, article))
if pdf_path:
import_pdf(article, pdf_path, pdf_filename)
except Exception as err:
logger.warning(err)
logger.warning(traceback.format_exc())
errors.append((filenames, err))
errors.append((jats_path, err))

return articles, errors

Expand Down Expand Up @@ -210,7 +216,7 @@ def get_jats_title(soup):
def get_jats_abstract(soup):
abstract = soup.find("abstract")
if abstract:
return abstract.text
return f"<p>{abstract.text}</p>"
else:
return ""

Expand Down Expand Up @@ -278,16 +284,22 @@ def get_jats_acc_date(soup):

def get_jats_keywords(soup):
jats_keywords_soup = soup.find("kwd-group")

# This was previously a set but is now a list to preserve keyword order.
if jats_keywords_soup:
return {
return [
keyword.text.strip()
for keyword in jats_keywords_soup.find_all("kwd")
}
]
else:
return set()
return list()


def get_jats_section_name(soup):
def get_jats_section_name(soup,get_section_from_subject):
if get_section_from_subject:
subject = soup.find("subject")
if subject:
return subject.text
return soup.find("article").attrs.get("article-type")


Expand Down Expand Up @@ -334,10 +346,44 @@ def get_jats_authors(soup, metadata_soup, author_notes=None):
corresp_email = author_notes.find("email")
if corresp_email:
author_data["email"] = corresp_email.text
else:
# Check and alternative route for identifying corresp
# authors
corresp_ref = author.find(
'xref', {'ref-type': 'corresp'}
)
if corresp_ref:
author_data["correspondence"] = True

if author_notes:
xref_rid = corresp_ref.get('rid')
corr_note = author_notes.find(
'corresp', {'id': xref_rid}
)
if corr_note:
corresp_email = corr_note.find(
'email'
)
if corresp_email:
author_data["email"] = corresp_email.text

authors.append(author_data)
return authors


def get_custom_how_to_cite(metadata_soup):
custom_meta_tags = metadata_soup.find_all('custom-meta')
for custom_meta_tag in custom_meta_tags:
meta_name_tag = custom_meta_tag.find('meta-name')
if meta_name_tag:
meta_name_value = meta_name_tag.string
if meta_name_value in ['How to cite', 'How To Cite']:
meta_value_tag = custom_meta_tag.find('meta-value')
if meta_value_tag:
return meta_value_tag.text
return ''


def get_orcid(author_soup):
contrib_ids = author_soup.findAll('contrib-id')
for ci in contrib_ids:
Expand Down Expand Up @@ -372,12 +418,19 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
journal = get_lost_found_journal()

with transaction.atomic():
section, _ = submission_models.Section.objects \
.get_or_create(
journal=journal,
name=metadata["section_name"],
)
section.save()
try:
section_map = models.SectionMap.objects.get(
article_type=metadata["section_name"],
section__journal=journal,
)
if section_map:
section = section_map.section
except models.SectionMap.DoesNotExist:
section, _ = submission_models.Section.objects \
.get_or_create(
journal=journal,
name=metadata["section_name"],
)

article = get_article(metadata.get("identifiers", {}), journal)
if not article:
Expand All @@ -386,14 +439,16 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
title=metadata["title"],
abstract=metadata["abstract"],
date_published=metadata["date_published"],
date_accepted=metadata["date_submitted"],
date_accepted=metadata["date_accepted"],
date_submitted=metadata["date_submitted"],
rights=metadata["rights"],
stage=stage or submission_models.STAGE_PUBLISHED,
is_import=True,
owner=owner,
first_page=metadata["first_page"],
last_page=metadata["last_page"]
last_page=metadata["last_page"],
custom_how_to_cite=metadata['custom_how_to_cite'],
article_agreement='This article is a JATS import.',
)
article.section = section
article.save()
Expand All @@ -403,11 +458,13 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
article.abstract = metadata["abstract"]
article.date_published = metadata["date_published"]
article.date_published = metadata["date_published"]
article.date_accepted = metadata["date_submitted"]
article.date_accepted = metadata["date_accepted"]
article.date_submitted = metadata["date_submitted"]
article.rights = metadata["rights"]
article.first_page = metadata["first_page"]
article.last_page = metadata["last_page"]
article.custom_how_to_cite = metadata["custom_how_to_cite"]
article.section = section
article.save()

if metadata["identifiers"]["doi"]:
Expand Down Expand Up @@ -444,7 +501,6 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
"orcid": author["orcid"],
},
)

fa = submission_models.FrozenAuthor.objects.create(
article=article,
author=account,
Expand All @@ -454,6 +510,7 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
frozen_orcid=author["orcid"],
frozen_email=author['email'],
order=idx,
display_email=True if author['correspondence'] else False,
)
if account and author["correspondence"]:
article.correspondence_author = account
Expand Down Expand Up @@ -491,6 +548,13 @@ def save_article(metadata, journal=None, issue=None, owner=None, stage=None):
journal=journal,
defaults={"issue_type": issue_type}
)
journal_models.SectionOrdering.objects.update_or_create(
issue=issue,
section=section,
defaults={
"order": 0,
}
)
issue.articles.add(article)
article.primary_issue = issue
article.save()
Expand Down Expand Up @@ -701,7 +765,7 @@ def import_jats_preprint(
meta["license_url"], meta["license_text"] = get_jats_license(jats_soup)
meta["authors"] = []
authors_soup = metadata_soup.find("contrib-group")
author_notes = metadata_soup.find("author_notes")
author_notes = jats_soup.find("author-notes")
if authors_soup:
meta["authors"] = get_jats_authors(
authors_soup,
Expand Down Expand Up @@ -837,7 +901,7 @@ def save_preprint(
return preprint


def import_html_reviews(preprint, review_files, owner):
def import_html_reviews(preprint, review_files, owner, number=None):
review_round, _ = review_models.ReviewRound.objects.get_or_create(
round_number=1,
article=preprint.article,
Expand All @@ -846,6 +910,7 @@ def import_html_reviews(preprint, review_files, owner):
journal=preprint.article.journal,
).first()
for review_file in review_files:
print(f"Importing {review_file}")
with open(review_file, 'r') as r_file:
contents = r_file.read()
try:
Expand Down Expand Up @@ -878,7 +943,7 @@ def import_html_reviews(preprint, review_files, owner):
assignment=review_assignment,
original_element=default_element,
defaults={
'answer': contents.strip().replace('\n', ''),
'answer': answer,
'author_can_see': True,
}
)
Expand Down
63 changes: 63 additions & 0 deletions management/commands/fetch_crossref_how_to_cite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from django.core.management.base import BaseCommand

import requests
import time
from pprint import pprint

from submission import models


class Command(BaseCommand):
"""For a give journal query crossref for the given citation format
and update the custom how to cite field."""

help = "Gets custom how to cite using Crossref."

def add_arguments(self, parser):
parser.add_argument('--journal', type=str)
parser.add_argument('--article_id', type=str)
parser.add_argument('--style', type=str)
parser.add_argument('--locale', type=str)
parser.add_argument('--mailto', type=str)

def handle(self, *args, **options):
errors = []
articles = models.Article.objects.filter(journal__code=options.get('journal'))

article_id = options.get('article_id')
if article_id:
articles = articles.filter(pk=article_id)

for index, article in enumerate(articles):
print(f"Getting how to cite for article #{article.pk}. {index}/{articles.count()}")
if article.get_doi():
try:
r = requests.get(
headers={
'Accept': 'text/bibliography',
'style': options.get('style'),
'locale': options.get('locale')
},
url=f"https://api.crossref.org/v1/works/{article.get_doi()}/transform?mailto={options.get('mailto')}",
)
r.encoding = 'UTF-8'
how_to_cite = r.text.strip()
if r.status_code == 200:
print(f"Response: {how_to_cite}")
article.custom_how_to_cite = how_to_cite
article.save()
print(f"Article #{article.pk} how to cite updated.")
else:
print(f"Crossref API responded with: {r.status_code}")
except Exception as e:
errors.append(
{'article': article, 'error': e}
)
else:
print(f"Article #{article.pk} does not have a DOI")

time.sleep(2)

print('Errors:')
pprint(errors)

Loading