#!/usr/bin/python import string import os import sys import re # functions def getSeq(file): infile = open(file, "r") seqlines = infile.readlines() seqlines = map(string.strip, seqlines) infile.close() if seqlines[0][0] == ">": seqlines = seqlines[1:] return string.join(seqlines, "") # end of functions lines = sys.stdin.readlines() lines = map(string.strip, lines) # check for no hit if lines[0] == "No match found!": print "No match found!" sys.exit(); # check that the first line is a valid line and use this to determine whether the input is ok data = lines[0].split(",") if len(data) != 12: print "Input is not correct!" sys.exit(); # transform the data into a data structure record = [] record.append(data[0]) # mRNA file record.append(data[1]) # genomic file record.append(data[2]) # strand record.append([]) # exon info for line in lines: data = line.split(",") record[3].append(data[3:12]) # each record is a array in a array, mStart, mEnd, gStart, gEnd, match, mismatch, gap, mseq, seq # read command line parameters format = "1" for i in range(len(sys.argv)): if sys.argv[i] == "-o": if i+1 < len(sys.argv): if sys.argv[i+1].isdigit() and int(sys.argv[i+1]) >= 1 and int(sys.argv[i+1]) <= 3: format = sys.argv[i+1] else: print "Invalid output format type!" sys.exit(); else: print "No output format specified!" sys.exit(); # print format 1, fasta formatted exons, each is a fasta sequence if format == "1": num = 1 for i in record[3]: if record[2] == "0": print ">Exon %s mRNA %s %s %s %s %s %s" % (num, record[0], i[0], i[1], i[4], i[5], i[6]) else: print ">Exon %s Reverse_Complemented_mRNA %s %s %s %s %s %s" % (num, record[1], i[0], i[1], i[4], o[5], i[6]) print i[7] print ">Exon %s Genomic %s %s %s %s %s %s" % (num, record[1], i[2], i[3], i[4], i[5], i[6]) print i[8] num = num + 1 # print format 2, fasta formatted exons, as two fasta sequences, with '-' in the mseq for the introns elif format == "2": # read in the seq file first seq = getSeq(record[1]) # print out mRNA header if record[2] == "0": print ">mRNA %s %s %s" % (record[0], record[3][0][0], record[3][-1][1]) else: print ">Reverse_Complemented_mRNA %s %s %s" % (record[0], record[3][0][0], record[3][-1][1]) # print mRNA sequence outline = "" for i in range(len(record[3])): if i != len(record[3])-1: outline = outline + record[3][i][7] + "-" * (int(record[3][i+1][2])-int(record[3][i][3])-1) else: outline = outline + record[3][i][7] print outline # print out genomic header print ">Genomic %s %s %s" % (record[1], record[3][0][2], record[3][-1][3]) # print out genomic sequence outline = "" for i in range(len(record[3])): if i != len(record[3])-1: outline = outline + record[3][i][8] + seq[int(record[3][i][3])+1:int(record[3][i+1][2])] else: outline = outline + record[3][i][8] print outline # print format 3, fasta formatted introns, as individual fasta sequences elif format == "3": # read in the seq file first seq = getSeq(record[1]) for i in range(len(record[3])-1): # iterate through each intron # print the header if record[2] == "0": print ">Intron %s sense file:%s start:%s end:%s length:%s" % (str(i+1), record[1], str(int(record[3][i][3])+1), str(int(record[3][i+1][2])-1), str(int(record[3][i+1][2])-int(record[3][i][3])-1)) else: print ">Intron %s anti-sense file:%s start:%s end:%s length:%s" % (str(i+1), record[1], str(int(record[3][i][3])+1), str(int(record[3][i+1][2])-1), str(int(record[3][i+1][2])-int(record[3][i][3])-1)) # print the intron sequence print seq[int(record[3][i][3])+1:int(record[3][i+1][2])]