Skip to content

Commit

Permalink
split full text
Browse files Browse the repository at this point in the history
  • Loading branch information
trieuhl committed Aug 17, 2020
1 parent 615cd65 commit d75e59e
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 19 deletions.
20 changes: 18 additions & 2 deletions pubmed/med2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ def pmc2text(pmid):

body_tag = markup.find("div", class_="body")
sec_tags = body_tag and body_tag.find_all("div", class_="sec")
content = "\n".join(sec_tag.get_text().strip() for sec_tag in sec_tags) if sec_tags else "N/A"
# content = "\n".join(sec_tag.get_text().strip() for sec_tag in sec_tags) if sec_tags else "N/A"

return title, abstract, content
sections_ = []

# get text
if sec_tags:
for sec_tag in sec_tags:
sec_title = sec_tag.find("h2")
if sec_title:
sec_title = sec_title.text
else:
sec_title = sec_tag.find("h3")
if sec_title:
sec_title = sec_title.text
sec_text = sec_tag.get_text().strip()

sections_.append((sec_title, sec_text))

return title, abstract, sections_
57 changes: 40 additions & 17 deletions pubmed/pubmed2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def pmids2text(pmid_path, textdir):

# get text given each PMID, write to file
for pmid in pmid_list:

print(pmid)
title, abstract = med2text.pmid2text(pmid)

Expand All @@ -47,6 +48,7 @@ def pmids2text(pmid_path, textdir):
# get text given PMCID
for pmcid in pmcid_list:
try:

print(pmcid)
title, abstract, content = med2text.pmc2text(pmcid)
if len(title) > 0:
Expand All @@ -72,6 +74,9 @@ def pmid2text(pmid, textdir):
if not os.path.exists(textdir):
os.makedirs(textdir)

if os.path.exists(textdir):
os.system('rm ' + textdir + 'PMID-' + pmid + '*')

# get text given each PMID, write to file
print(pmid)
title, abstract = med2text.pmid2text(pmid)
Expand All @@ -91,16 +96,33 @@ def pmcid2text(pmcid, textdir):
os.makedirs(textdir)

try:

if os.path.exists(textdir):
os.system('rm ' + textdir + pmcid.replace('PMC', 'PMC-') + '*')

print(pmcid)
title, abstract, content = med2text.pmc2text(pmcid)
title, abstract, sections_ = med2text.pmc2text(pmcid)

if len(title) > 0:
with open(os.path.join(textdir, ''.join(['PMC-', pmcid.replace('PMC', ''), '.txt'])), 'w') as fo:
ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-0', str(0), '-', 'TIAB', '.txt'])
with open(os.path.join(textdir, ftitle), 'w') as fo:
fo.write(title)
fo.write('\n\n')
fo.write(abstract)
fo.write('\n\n')
fo.write(content)
print('Done', pmcid)

if len(sections_) > 0:

for sec_id, sec_data in enumerate(sections_):
sec_title = sec_data[0].strip().replace(' ', '_')
sec_text = sec_data[1]
if sec_id < 9:
ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-0', str(sec_id + 1), '-', sec_title, '.txt'])
else:
ftitle = ''.join(['PMC-', pmcid.replace('PMC', ''), '-', str(sec_id + 1), '-', sec_title, '.txt'])
with open(os.path.join(textdir, ftitle), 'w') as fo:
fo.write(sec_text)

print('Done', pmcid)

except urllib3.exceptions.ProtocolError as error:
print('Protocol Error', pmcid)
Expand All @@ -112,15 +134,16 @@ def pmcid2text(pmcid, textdir):

if __name__ == '__main__':
    # Command-line entry point.
    #
    # Usage:
    #   python pubmed2text.py pmids <pmid_list_path> <textdir>
    #   python pubmed2text.py pmid  <pmid>           <textdir>
    #   python pubmed2text.py pmcid <pmcid>          <textdir>
    #
    # Example:
    #   python pubmed2text.py pmcid PMC4353630 ../data/my-pubmed/original_text/
    #
    # The mode (first argument) selects which converter runs; the remaining
    # two arguments are passed to it unchanged. The previously hard-coded
    # debug call for a single PMC id has been removed so the script only
    # does what the caller asks for.
    handlers = {
        'pmids': pmids2text,  # file containing a list of PubMed ids
        'pmid': pmid2text,    # a single PubMed id
        'pmcid': pmcid2text,  # a single PubMed Central id
    }

    # Fail with a readable message instead of an IndexError (too few
    # arguments) or a silent no-op (unknown mode). Extra trailing arguments
    # are still ignored, as before.
    if len(sys.argv) < 4 or sys.argv[1] not in handlers:
        sys.exit('usage: pubmed2text.py {pmids|pmid|pmcid} <id-or-id-file> <textdir>')

    handlers[sys.argv[1]](sys.argv[2], sys.argv[3])

0 comments on commit d75e59e

Please sign in to comment.