From d75e59ee1237445f4209fc123bd5238a2aa29bb5 Mon Sep 17 00:00:00 2001 From: Long Trieu Date: Mon, 17 Aug 2020 22:01:26 +0900 Subject: [PATCH] split full text --- pubmed/med2text.py | 20 +++++++++++++-- pubmed/pubmed2text.py | 57 ++++++++++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/pubmed/med2text.py b/pubmed/med2text.py index 607aef2..9d82788 100644 --- a/pubmed/med2text.py +++ b/pubmed/med2text.py @@ -37,6 +37,22 @@ def pmc2text(pmid): body_tag = markup.find("div", class_="body") sec_tags = body_tag and body_tag.find_all("div", class_="sec") - content = "\n".join(sec_tag.get_text().strip() for sec_tag in sec_tags) if sec_tags else "N/A" + # content = "\n".join(sec_tag.get_text().strip() for sec_tag in sec_tags) if sec_tags else "N/A" - return title, abstract, content + sections_ = [] + + # get text + if sec_tags: + for sec_tag in sec_tags: + sec_title = sec_tag.find("h2") + if sec_title: + sec_title = sec_title.text + else: + sec_title = sec_tag.find("h3") + if sec_title: + sec_title = sec_title.text + sec_text = sec_tag.get_text().strip() + + sections_.append((sec_title, sec_text)) + + return title, abstract, sections_ diff --git a/pubmed/pubmed2text.py b/pubmed/pubmed2text.py index 4f4514b..796bc92 100644 --- a/pubmed/pubmed2text.py +++ b/pubmed/pubmed2text.py @@ -34,6 +34,7 @@ def pmids2text(pmid_path, textdir): # get text given each PMID, write to file for pmid in pmid_list: + print(pmid) title, abstract = med2text.pmid2text(pmid) @@ -47,6 +48,7 @@ def pmids2text(pmid_path, textdir): # get text given PMCID for pmcid in pmcid_list: try: + print(pmcid) title, abstract, content = med2text.pmc2text(pmcid) if len(title) > 0: @@ -72,6 +74,9 @@ def pmid2text(pmid, textdir): if not os.path.exists(textdir): os.makedirs(textdir) + if os.path.exists(textdir): + os.system('rm ' + textdir + 'PMID-' + pmid + '*') + # get text given each PMID, write to file print(pmid) title, abstract = med2text.pmid2text(pmid) @@ -91,16 +96,33 @@ def pmcid2text(pmcid, textdir): os.makedirs(textdir) try: + + if os.path.exists(textdir): + os.system('rm ' + textdir + pmcid.replace('PMC', 'PMC-') + '*') + print(pmcid) - title, abstract, content = med2text.pmc2text(pmcid) + title, abstract, sections_ = med2text.pmc2text(pmcid) + if len(title) > 0: - with open(os.path.join(textdir, ''.join(['PMC-', pmcid.replace('PMC', ''), '.txt'])), 'w') as fo: + ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-0', str(0), '-', 'TIAB', '.txt']) + with open(os.path.join(textdir, ftitle), 'w') as fo: fo.write(title) fo.write('\n\n') fo.write(abstract) - fo.write('\n\n') - fo.write(content) - print('Done', pmcid) + + if len(sections_) > 0: + + for sec_id, sec_data in enumerate(sections_): + sec_title = sec_data[0].strip().replace(' ', '_') + sec_text = sec_data[1] + if sec_id < 9: + ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-0', str(sec_id + 1), '-', sec_title, '.txt']) + else: + ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-', str(sec_id + 1), '-', sec_title, '.txt']) + with open(os.path.join(textdir, ftitle), 'w') as fo: + fo.write(sec_text) + + print('Done', pmcid) except urllib3.exceptions.ProtocolError as error: print('Protocol Error', pmcid) @@ -112,15 +134,16 @@ def pmcid2text(pmcid, textdir): if __name__ == '__main__': # pmid2text('../data/my-pubmed/pmid.txt', '../data/my-pubmed/original_text/') - - option = sys.argv[1] - - # pubmed id list - if option == 'pmids': - pmids2text(sys.argv[2], sys.argv[3]) - - elif option == 'pmid': - pmid2text(sys.argv[2], sys.argv[3]) - - elif option == 'pmcid': - pmcid2text(sys.argv[2], sys.argv[3]) + pmcid2text('PMC4353630', '../data/my-pubmed/original_text/') + + # option = sys.argv[1] + # + # # pubmed id list + # if option == 'pmids': + # pmids2text(sys.argv[2], sys.argv[3]) + # + # elif option == 'pmid': + # pmid2text(sys.argv[2], sys.argv[3]) + # + # elif option == 'pmcid': + # pmcid2text(sys.argv[2], sys.argv[3])