LiftRsNumber.py
From Genome Analysis Wiki
Jump to navigationJump to searchThe printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
#!/usr/bin/python
import sys, os
def usage():
print("%s file: lift over rs number. " % sys.argv[0] )
print("file should look like:")
print("11111")
print("11112")
print("...")
def myopen(fn):
import gzip
try:
h = gzip.open(fn)
ln = h.read(2) # read arbitrary bytes so check if @param fn is a gzipped file
except:
# cannot read in gzip format
return open(fn)
h.close()
return gzip.open(fn)
RS_HISTORY = set() # store rs
RS_MERGE = dict() # high_rs -> (lower_rs, current_rs)
if __name__ == '__main__':
if len(sys.argv) != 2:
usage()
sys.exit(1)
# record obsolete rs number
for ln in myopen('/net/wonderland/home/zhanxw/mydata/dbSNP/SNPHistory.bcp.gz'):
fd = ln.strip().split('\t')
if ln.find('re-activ') < 0:
RS_HISTORY.add(fd[0])
# record rs number merge history
for ln in myopen('/net/wonderland/home/zhanxw/mydata/dbSNP/RsMergeArch.bcp.gz'):
fd = ln.strip().split('\t')
h, l = fd[0], fd[1]
c = fd[6]
# print 'c=', c
RS_MERGE[h] = (l, c)
# # find some retracted SNPs for fun
# for h, v in RS_MERGE.iteritems():
# l, c = v
# if c in RS_HISTORY:
# print h, '->', l
import re
rsPattern = re.compile(r'[0-9]*')
for ln in myopen(sys.argv[1]):
rs = ln.strip()
if not rsPattern.match(rs):
print 'ERROR: rs number should be like "1000"'
sys.exit(2)
# rs number not appear in RS_MERGE -> there is no merge on this rs
if rs not in RS_MERGE:
print "unchanged\t", rs
continue
# lift rs number
while True:
if rs in RS_MERGE:
rsLow, rsCurrent = RS_MERGE[rs]
if rsCurrent not in RS_HISTORY:
print "lifted\t", rsCurrent
break
else:
rs = rsLow
else:
print "unlifted\t", rs
break