forked from Genome3d/multimorbid3D
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_rsmerged_archive.py
executable file
·36 lines (30 loc) · 1.1 KB
/
create_rsmerged_archive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#! /usr/bin/env python
import os
import sys

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text
from tqdm import tqdm
# Load the RsMergeArch pairs dump into a SQLite table and index both columns.
#
# The input is read as a tab-separated, header-less file whose two columns
# are named 'new' and 'old' below; the '.gz' suffix lets pandas infer the
# compression.  NOTE(review): the meaning of the two columns is taken from
# the names used here -- confirm against the upstream dump.
merged_arch = 'data/snps/human_9606_b151_GRCh38p7_RsMergeArch.pairs.bcp.gz'
out_dir = 'data/snps/dbs/'
os.makedirs(out_dir, exist_ok=True)
# out_dir already ends with '/', so join the path instead of inserting
# another separator (which produced '.../dbs//human_...').
db_url = 'sqlite:///' + os.path.join(
    out_dir, 'human_9606_b151_GRCh38p7_RsMergeArch.bcp.db')
desc = 'Building merged snps table...'
bar_format = '{desc}: {n_fmt} {unit}'
table = 'merged_arch'
chunksize = 200000
db = create_engine(db_url, echo=False)

# Stream the file in fixed-size chunks so it never has to fit in memory.
chunks = pd.read_csv(merged_arch, sep='\t', chunksize=chunksize,
                     header=None, names=['new', 'old'])

# The context manager closes the progress bar even if a chunk fails to load.
with tqdm(total=0, unit='entries', desc=desc, disable=False,
          bar_format=bar_format) as t:
    for chunk_no, df in enumerate(chunks):
        # The first chunk recreates the table; later chunks extend it.
        if_exists = 'replace' if chunk_no == 0 else 'append'
        df.to_sql(table, con=db, if_exists=if_exists, index=False)
        # The row count is unknown up front, so grow the total as we go.
        t.total += len(df)
        t.update(len(df))

# Engine.execute() was removed in SQLAlchemy 2.0; run the DDL on an explicit
# connection inside a transaction, which works on both 1.x and 2.x.
with db.begin() as conn:
    for column in ('new', 'old'):
        conn.execute(text(
            f'CREATE INDEX idx_{table}_{column} ON {table} ({column})'))