add pubmed

aistairc · Aug 14, 2020 · 62f35dc · 62f35dc
1 parent 5a71694
commit 62f35dc
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -108,29 +108,44 @@ sh run.sh eval [task] gold dev sp
 - [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28)
 - [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)
 
-### Predict (with raw text)
+## Predict (with raw text)
 
-1. Prepare raw text input
+- You can either prepare the raw text yourself or retrieve it from a PubMed ID.
+
+### Prepare your own raw text
 
 - To run prediction on your own raw text using our trained model for a task ([task] = cg, pc, ge11, etc.), put your raw text files at the following path:
 
 ```bash
 data/raw-text/[task]/PMID-*.txt
 ```
 
-2. Preprocess raw text
+### Get text from PubMed ID
+
+1. Installation
+
+```bash
+sh install.sh pubmed
+```
+
+2. Prepare data
+
+
+### Predict
+
+1. Preprocess raw text
 
 - Tokenize raw text and prepare data for prediction
 ```bash
 sh preprocess.sh raw
 ```
 
-3. Predict
+2. Predict
 ```bash
 sh run.sh predict [task] raw text
 ```
 
-4. Retrieve the original offsets
+3. Retrieve the original offsets
 
 ```bash
 sh run.sh offset [task] raw text

diff --git a/install.sh b/install.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#
+# Install the optional dependencies for a given task.
+#
+# Usage: sh install.sh <task>
+#   pubmed  - install pubmed_parser, ebooklib, beautifulsoup4 and lxml
+
+# Stop at the first failing command so a failed pip install is not
+# followed by a misleading "Done!".
+set -e
+
+# Default to an empty task when no argument is given.
+TASK=${1:-}
+
+case "$TASK" in
+    pubmed)
+        # pubmed_parser
+        echo "Install pubmed_parser and dependencies"
+        pip install git+https://github.com/titipata/pubmed_parser.git
+        pip install ebooklib beautifulsoup4 lxml
+        ;;
+    *)
+        # A missing or unknown task must not be reported as success.
+        echo "Usage: sh install.sh <task>   (supported tasks: pubmed)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Done!"
+
diff --git a/pubmed/med2text.py b/pubmed/med2text.py
@@ -0,0 +1,62 @@
+"""Get text from pubmed id"""
+
+import ebooklib
+from ebooklib import epub
+from bs4 import BeautifulSoup
+import pubmed_parser as pp
+import requests
+import tempfile
+
+
+def medline2text(mlid):
+    """Return the ``(title, abstract)`` pair for the given MEDLINE id.
+
+    The record is obtained from ``pubmed_parser.parse_xml_web`` (called with
+    ``save_xml=False``); its "title" and "abstract" entries are returned.
+    """
+    record = pp.parse_xml_web(mlid, save_xml=False)
+    title = record["title"]
+    abstract = record["abstract"]
+    return title, abstract
+
+
+def pubmed2text(pmid, timeout=30):
+    """Download a PMC article as EPUB and extract its text.
+
+    Args:
+        pmid: Article identifier used in the download URL, e.g. "PMC441591".
+            Everything after its first three characters is matched against
+            the names of the documents contained in the EPUB.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        A ``(title, abstract, content)`` tuple of strings. Any part that
+        cannot be located is reported as "N/A"; if no document in the EPUB
+        matches the identifier, all three parts are "N/A".
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If the server does not answer within ``timeout``.
+    """
+    headers = {
+        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
+    # Without a timeout a stalled connection would block forever.
+    response = requests.get(
+        url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmid}/epub/",
+        headers=headers,
+        timeout=timeout,
+    )
+    response.raise_for_status()
+
+    with tempfile.NamedTemporaryFile() as epub_file:
+        # read_epub() is given a file name, so the downloaded bytes are
+        # written to a temporary file first.
+        epub_file.write(response.content)
+        epub_file.flush()
+
+        article = epub.read_epub(epub_file.name)
+
+        for item in article.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+            # Only the document whose name contains the numeric id is parsed.
+            if pmid[3:] not in item.get_name():
+                continue
+
+            markup = BeautifulSoup(item.get_content(), "lxml-xml")
+
+            title_tag = markup.find("span", class_="article-title")
+            # get_text() instead of .string: .string is None when the title
+            # holds nested markup, which would make .strip() fail.
+            title = title_tag.get_text().strip() if title_tag else "N/A"
+
+            abstract_tag = markup.find("div", class_="abstract")
+            abstract = abstract_tag.get_text().strip() if abstract_tag else "N/A"
+
+            body_tag = markup.find("div", class_="body")
+            sec_tags = body_tag and body_tag.find_all("div", class_="sec")
+            content = "\n".join(sec_tag.get_text().strip() for sec_tag in sec_tags) if sec_tags else "N/A"
+
+            return title, abstract, content
+
+    # No document matched: keep the 3-tuple shape so callers can still unpack.
+    return "N/A", "N/A", "N/A"
+
+
+def main():
+    """Demo: fetch one MEDLINE record and one PMC article and print them."""
+    # medline
+    medline_title, medline_abstract = medline2text("18483370")
+    print("Title: \n", medline_title)
+    print("Abstract: \n", medline_abstract)
+
+    # pubmed
+    pmc_title, pmc_abstract, pmc_content = pubmed2text("PMC441591")
+    labelled_parts = (
+        ("Title: \n", pmc_title),
+        ("Abstract: \n", pmc_abstract),
+        ("Content: \n", pmc_content),
+    )
+    for label, text in labelled_parts:
+        print(label, text)
+
+
+# Run the demo only when this file is executed as a script.
+if __name__ == '__main__':
+    main()