Download as pdf or txt
Download as pdf or txt
You are on page 1 of 13

Suggested solutions exercises 11-23

11.py
Code

#!/usr/bin/python3

import sys

def is_fastq_name(filename):
    """Return True when *filename* contains '.fastq' or '.fq'.

    You can test if a substring is found in a string in a similar way
    as you do a membership test for a list.
    """
    return '.fastq' in filename or '.fq' in filename


def count_records(lines, fastq=False):
    """Count the sequence records in an iterable of text lines.

    Args:
        lines: iterable of lines, e.g. an open file handle.
        fastq: when True the input is treated as FASTQ and the number of
            lines is divided by four (four lines per record); otherwise
            every line starting with '>' counts as one FASTA record.

    Returns:
        The number of records as an int.
    """
    if fastq:
        return sum(1 for _ in lines) // 4
    return sum(1 for line in lines if line.startswith('>'))


def main():
    """Print the number of records in the file named by sys.argv[1]."""
    filename = sys.argv[1]
    with open(filename, 'r') as in_fh:
        print(count_records(in_fh, fastq=is_fastq_name(filename)))


if __name__ == '__main__':
    main()

1
12.py
Code

#!/usr/bin/python3

# Exercise: Transcribe DNA fasta to RNA


# Author: Martin Basterrechea

import sys

# Maps both lower- and upper-case thymine to uracil in a single pass.
DNA_TO_RNA = str.maketrans('tT', 'uU')


def transcribe(lines):
    """Yield each FASTA line with its DNA sequence transcribed to RNA.

    Header lines (starting with '>') are passed through unchanged, so a
    't' in a sequence name is never rewritten. Trailing whitespace is
    stripped from every line; blank lines come out as empty strings.

    Args:
        lines: iterable of lines, e.g. an open file handle.
    """
    for line in lines:
        line = line.rstrip()
        # startswith() is safe on an empty string, where line[0] would
        # raise IndexError.
        if not line.startswith('>'):
            line = line.translate(DNA_TO_RNA)
        yield line


def main():
    """Transcribe the FASTA file sys.argv[1] into the file sys.argv[2]."""
    if len(sys.argv) < 3:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Error: Not enough arguments. Usage: 12.py inputfile outputfile')

    inputname = sys.argv[1]
    outputname = sys.argv[2]

    with open(inputname, 'r') as inf, open(outputname, 'w') as outf:
        for line in transcribe(inf):
            print(line, file=outf)


if __name__ == '__main__':
    main()

2
13.py
Code

#!/usr/bin/python3
# Exercise: Calculate GC content in multi-fasta file
# Author: Martin Basterrechea

import sys

def count_gc_at(lines):
    """Count G/C and A/T letters in the sequence lines of a multi-fasta.

    Header lines (starting with '>') are skipped. Counting is
    case-insensitive; any other character is ignored.

    Args:
        lines: iterable of lines, e.g. an open file handle.

    Returns:
        A tuple (gc, at) of two ints.
    """
    gc = 0
    at = 0
    for line in lines:
        if line.startswith('>'):
            continue
        seq = line.lower()
        # Don't use regular expressions for this, it's slower
        gc += seq.count('c') + seq.count('g')
        at += seq.count('a') + seq.count('t')
    return gc, at


def main():
    """Print the GC fraction (two decimals) of the file sys.argv[1]."""
    if len(sys.argv) < 2:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Error: Not enough arguments. Usage: 13.py inputfile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        gc, at = count_gc_at(inf)

    # Without this check an empty or header-only file divides by zero.
    if gc + at == 0:
        sys.exit('Error: no A, C, G or T found in {}'.format(inputname))

    print(round(gc / (gc + at), 2))


if __name__ == '__main__':
    main()

3
14.py
Code

#!/usr/bin/python3
# Exercise: Calculate average length of sequences in multi-fasta file
# Author: Martin Basterrechea

import sys

def sequence_stats(lines):
    """Count the sequences and their summed length in a multi-fasta.

    Every line starting with '>' counts as one sequence; the stripped
    length of every other line is added to the total. Blank lines add
    nothing.

    Args:
        lines: iterable of lines, e.g. an open file handle.

    Returns:
        A tuple (seq_count, total_length) of two ints.
    """
    seq_count = 0
    total_length = 0
    for line in lines:
        line = line.rstrip()
        # startswith() is safe on an empty string, where line[0] would
        # raise IndexError.
        if line.startswith('>'):
            seq_count += 1
        else:
            total_length += len(line)
    return seq_count, total_length


def main():
    """Print the average sequence length (one decimal) of sys.argv[1]."""
    # First argument is second in the sys.argv list!
    if len(sys.argv) < 2:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Error: Not enough arguments. Usage: 14.py inputfile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        seq_count, total_length = sequence_stats(inf)

    # Without this check a file with no headers divides by zero.
    if seq_count == 0:
        sys.exit('Error: no sequences found in {}'.format(inputname))

    # We get the sum of the sequence lengths and divide it by the number
    # of sequences
    print(round(total_length / seq_count, 1))


if __name__ == '__main__':
    main()

4
15.py
Code

#!/usr/bin/python3
# Exercise: Calculate GC content in fastq file

import sys

def count_gc_at(lines):
    """Count G/C and A/T letters in the sequence lines of a FASTQ file.

    Assumes the standard FASTQ layout of four lines per record (header,
    sequence, '+' separator, qualities), so only lines 2, 6, 10, ... are
    inspected. Counting is case-insensitive; other characters are ignored.

    Args:
        lines: iterable of lines, e.g. an open file handle.

    Returns:
        A tuple (gc, at) of two ints.
    """
    gc = 0
    at = 0
    for line_number, line in enumerate(lines, start=1):
        # `line_number % 2` would pick the header and '+' lines instead.
        if line_number % 4 != 2:
            continue
        seq = line.lower()
        gc += seq.count('c') + seq.count('g')
        at += seq.count('a') + seq.count('t')
    return gc, at


def main():
    """Print the GC fraction (two decimals) of the FASTQ file sys.argv[1]."""
    if len(sys.argv) < 2:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Error: Not enough arguments. Usage: 15.py fastqFile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        gc, at = count_gc_at(inf)

    # Without this check an empty file divides by zero.
    if gc + at == 0:
        sys.exit('Error: no A, C, G or T found in {}'.format(inputname))

    print(round(gc / (gc + at), 2))


if __name__ == '__main__':
    main()

5
16.py
Code

#!/usr/bin/python3

import sys
import re

# Compiling the pattern is not necessary, but faster than using the string directly
# Compiling the pattern is not necessary, but faster than using the string
# directly. The raw string keeps \d from being read as a string escape.
lengthPattern = re.compile(r'length=(\d+)')


def summarize(lines):
    """Collect header and sequence statistics from a FASTA file.

    Each header line (starting with '>') must contain 'length=<digits>';
    that number is added to the header-declared total. The stripped
    length of every other line is added to the counted total.

    Args:
        lines: iterable of lines, e.g. an open file handle.

    Returns:
        A tuple (count_id, count_nt, header_length): number of headers,
        nucleotides counted in sequence lines, and the sum of the lengths
        declared in the headers.

    Raises:
        ValueError: if a header line has no 'length=<digits>' field.
    """
    count_id = 0
    count_nt = 0
    header_length = 0

    for line in lines:
        line = line.rstrip()
        # An empty line is an empty string: startswith() is False for it
        # and it adds 0 to the nucleotide count.
        if line.startswith('>'):
            count_id += 1
            match = lengthPattern.search(line)
            if match is None:
                raise ValueError('No length= field in header: {}'.format(line))
            header_length += int(match.group(1))
        else:
            count_nt += len(line)

    return count_id, count_nt, header_length


def main():
    """Print ID count plus counted and header-declared lengths for sys.argv[1]."""
    with open(sys.argv[1], 'r') as in_fh:
        countId, countNt, headerLength = summarize(in_fh)

    # Without this check a file with no headers divides by zero below.
    if countId == 0:
        sys.exit('Error: no FASTA headers found in {}'.format(sys.argv[1]))

    print('IDs: {}'.format(countId))
    print('Total counted nucl: {} nt, Average: {} nt'.format(
        countNt, round(countNt / countId, 1)))
    print('Total label length: {} nt, Average: {} nt'.format(
        headerLength, round(headerLength / countId, 1)))


if __name__ == '__main__':
    main()

6
17.py
Code

#!/usr/bin/python3

import sys
import re

def extract_record(lines, target_id):
    """Return the lines of the first FASTA record whose ID is *target_id*.

    The ID of a record is the first whitespace-delimited word of its
    header line, without the leading '>'. All sequence lines up to the
    next header are included, so wrapped sequences are returned in full.
    Blank lines are skipped.

    Args:
        lines: iterable of lines, e.g. an open file handle.
        target_id: the sequence ID to look for (without '>').

    Returns:
        A list of stripped lines (header first), or an empty list when
        the ID is not present.
    """
    wanted = '>' + target_id
    record = []
    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            if record:
                # The next record starts here, so the wanted one is complete.
                break
            # split() with no argument splits on any whitespace, so element
            # 0 is '>ID' without the description. Comparing with == rather
            # than `in` stops the ID 'seq1' from also matching '>seq10'.
            # A regular expression such as
            #   re.match('>' + re.escape(target_id) + r'(\s|$)', line)
            # does the same job, but the string solution is much faster
            # for bigger files.
            if line.split()[0] == wanted:
                record.append(line)
        elif record:
            record.append(line)
    return record


def main():
    """Print the record with ID sys.argv[1] from the FASTA file sys.argv[2]."""
    target_id = sys.argv[1]

    with open(sys.argv[2], 'r') as in_fh:
        record = extract_record(in_fh, target_id)

    if record:
        for line in record:
            print('{}'.format(line))
    else:
        print('The ID {} was not found'.format(target_id))


if __name__ == '__main__':
    main()

7
18.py
Code

#!/usr/bin/python3

#Usage: ./18.py input_fasta

import sys


def unwrap_fasta(lines):
    """Yield FASTA output lines with each sequence joined onto one line.

    Header lines (starting with '>') are yielded as they are, minus
    trailing whitespace. The sequence lines that follow a header are
    concatenated and yielded as a single line. Blank lines are dropped.

    Args:
        lines: iterable of lines, e.g. an open file handle.
    """
    chunks = []
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            if chunks:
                # Flush the previous record's sequence before the new header.
                yield ''.join(chunks)
                chunks = []
            yield line
        elif line:
            chunks.append(line)
    if chunks:
        yield ''.join(chunks)


def main():
    """Print the FASTA file sys.argv[1] with one line per sequence."""
    with open(sys.argv[1], 'r') as f:
        for line in unwrap_fasta(f):
            print(line)


if __name__ == '__main__':
    main()

8
19.py
Code

#!/usr/bin/python3
# Exercise: Calculate the total mass of an amino acid sequence
# Author: Martin Basterrechea

import sys

# Mass of each amino acid, keyed by its one-letter code (upper case).
mass_dict = {
    'A': 71.0788,
    'C': 103.1388,
    'D': 115.0886,
    'E': 129.1155,
    'F': 147.1766,
    'G': 57.0519,
    'H': 137.1411,
    'I': 113.1594,
    'K': 128.1741,
    'L': 113.1594,
    'M': 131.1926,
    'N': 114.1038,
    'P': 97.1167,
    'Q': 128.1307,
    'R': 156.1875,
    'S': 87.0782,
    'T': 101.1051,
    'V': 99.1326,
    'W': 186.2132,
    'Y': 163.1760,
}

sequence = input('Enter an amino acid sequence: ')

# Add up the masses one residue at a time, stopping at the first letter
# that is not a key of mass_dict.
total_mass = 0
for residue in sequence:
    try:
        # Indexing raises KeyError for an unknown letter
        total_mass += mass_dict[residue.upper()]
    except KeyError:
        print('Invalid amino acid!: {}'.format(residue))
        sys.exit()

print('Total mass: ' + str(total_mass))

9
20.py
Code

#!/usr/bin/python3

#Usage: ./20_getSeq.py oligo fasta_file


#Expects single line fasta format

import sys
import re

# The oligo to search for, and its reverse complement: reverse the string,
# then swap a<->t and c<->g in both cases.
targetOligo = sys.argv[1]
reverseOligo = targetOligo[::-1].translate(str.maketrans('acgtACGT', 'tgcaTGCA'))

print('#Oligo\t{}'.format(targetOligo))
print('#id\tabundance')

with open(sys.argv[2], 'r') as in_fh:
    for line in in_fh:
        if line.startswith('>'):
            # New record: remember its ID (first word of the header) and
            # restart the hit counter.
            record_id = line.rstrip().split()[0]
            hits = 0
            continue

        # Sequence line: count case-insensitive matches of the oligo and
        # of its reverse complement.
        for oligo in (targetOligo, reverseOligo):
            hits += len(re.findall(oligo, line, re.I))

        if hits > 0:
            print('{}\t{}'.format(record_id, hits))

10
21.py
Code

#!/usr/bin/env python3
#Exercise: Translate DNA into AA
#Note that this script only works with non interleaved sequences

import sys

# Codon -> one-letter amino acid; '*' marks a stop codon.
gen_code = {'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L',
            'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S',
            'TAT':'Y', 'TAC':'Y', 'TAA':'*', 'TAG':'*',
            'TGT':'C', 'TGC':'C', 'TGA':'*', 'TGG':'W',
            'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
            'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
            'CAT':'H', 'CAC':'H', 'CAA':'Q', 'CAG':'Q',
            'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R',
            'ATT':'I', 'ATC':'I', 'ATA':'I', 'ATG':'M',
            'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
            'AAT':'N', 'AAC':'N', 'AAA':'K', 'AAG':'K',
            'AGT':'S', 'AGC':'S', 'AGA':'R', 'AGG':'R',
            'GTT':'V', 'GTC':'V', 'GTA':'V', 'GTG':'V',
            'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
            'GAT':'D', 'GAC':'D', 'GAA':'E', 'GAG':'E',
            'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G'}


def translate(seq):
    """Translate a DNA string into a one-letter amino acid string.

    The sequence is read codon by codon from its first letter. A trailing
    partial codon (length not a multiple of 3) is dropped. A codon that
    is not in gen_code, e.g. one containing N, is translated to 'X'
    instead of raising KeyError.

    Args:
        seq: DNA sequence, upper or lower case.

    Returns:
        The translated amino acid string.
    """
    seq = seq.upper()
    # Stop before the incomplete codon at the end, if there is one.
    usable = len(seq) - len(seq) % 3
    return ''.join(gen_code.get(seq[n:n + 3], 'X') for n in range(0, usable, 3))


def translate_fasta(lines):
    """Yield FASTA lines with every sequence line translated to protein.

    Header lines (starting with '>') are passed through unchanged and
    blank lines are skipped. Each sequence line is translated on its own,
    so this only works with non interleaved sequences.

    Args:
        lines: iterable of lines, e.g. an open file handle.
    """
    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            yield line
        else:
            yield translate(line)


def main():
    """Translate the FASTA file sys.argv[1] into the file sys.argv[2]."""
    if len(sys.argv) < 3:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Error, not enough arguments. Usage: 21.py input.fasta output.faa')

    with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:
        for line in translate_fasta(inf):
            print(line, file=outf)


if __name__ == '__main__':
    main()

11
22.py
Code

#!/usr/bin/python3

# Runs as 22.py rRNAlistFile inFastaFile outFastaFile

import sys

# We create a set of the ids found in the rRNA list


def read_ids(lines):
    """Return the set of IDs listed one per line in *lines*.

    Trailing whitespace is stripped from each line and blank lines are
    ignored.

    Args:
        lines: iterable of lines, e.g. an open file handle.
    """
    return {line.rstrip() for line in lines if line.strip()}


def filter_fasta(lines, excluded_ids):
    """Yield the FASTA lines of every record whose ID is not excluded.

    The ID of a record is the first whitespace-delimited word of its
    header line without the leading '>'. A record is dropped (header and
    all of its sequence lines) when its ID is in *excluded_ids*. Blank
    lines are dropped; lines before the first header are kept.

    Args:
        lines: iterable of lines, e.g. an open file handle.
        excluded_ids: container of IDs to remove (a set gives fast lookups).
    """
    keep = True
    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            seqid = line.split()[0][1:]  # Remove the >
            keep = seqid not in excluded_ids
        if keep:
            yield line


def main():
    """Copy sys.argv[2] to sys.argv[3] without the records listed in sys.argv[1]."""
    if len(sys.argv) < 4:
        # A string argument is written to stderr and sets exit status 1.
        sys.exit('Usage: 22.py rRNAlistFile inFastaFile outFastaFile')

    # We create a set of the ids found in the rRNA list
    with open(sys.argv[1], 'r') as inrRNA:
        rRNAset = read_ids(inrRNA)

    # Parse the fasta file
    with open(sys.argv[2], 'r') as inFasta, open(sys.argv[3], 'w') as outFasta:
        for line in filter_fasta(inFasta, rRNAset):
            print(line, file=outFasta)


if __name__ == '__main__':
    main()

12
23.py
Code

#!/usr/bin/python3

import sys

ALLOWED_LETTERS = frozenset('acgt')
GC_LETTERS = frozenset('cg')


def report_progress(lines, every=10000):
    """Yield *lines* unchanged, reporting progress on stderr.

    After every *every* lines a '<n> lines processed' message is written
    to stderr, ending in a carriage return so the next message overwrites
    it.
    """
    for count, line in enumerate(lines, start=1):
        if count % every == 0:
            print('{} lines processed'.format(count), file=sys.stderr, end='\r')
        yield line


def codon_position_gc(lines):
    """Count nucleotides and G/C per codon position in a gene FASTA file.

    Every sequence is assumed to start at codon position 1. The position
    keeps counting across the lines of one record and restarts at each
    header, so sequences wrapped at any line width are handled. Only
    a, c, g and t (any case) are counted; other letters still occupy a
    position but are not counted.

    Args:
        lines: iterable of lines, e.g. an open file handle.

    Returns:
        A tuple (gc_counts, total_counts) of two three-element lists,
        indexed by codon position 0, 1 and 2.
    """
    total_counts = [0, 0, 0]
    gc_counts = [0, 0, 0]
    offset = 0  # Letters already seen in the current record

    for line in lines:
        line = line.lower().rstrip()
        if line.startswith('>'):
            offset = 0
            continue
        for n, letter in enumerate(line, start=offset):
            # Get the current codon position, here 0, 1 or 2
            current_pos = n % 3
            if letter in ALLOWED_LETTERS:
                total_counts[current_pos] += 1
                if letter in GC_LETTERS:
                    gc_counts[current_pos] += 1
        offset += len(line)

    return gc_counts, total_counts


def main():
    """Print the GC percentage at each codon position for sys.argv[1]."""
    if len(sys.argv) != 2:
        # Exit here: printing the usage and carrying on would crash on
        # sys.argv[1]. A string argument goes to stderr with exit status 1.
        sys.exit('Usage: {} gene_fasta_file'.format(sys.argv[0]))

    with open(sys.argv[1], 'r') as in_fh:
        gc_counts, total_counts = codon_position_gc(report_progress(in_fh))

    for pos, (gc, total) in enumerate(zip(gc_counts, total_counts), start=1):
        if total == 0:
            # Nothing was counted at this position, so the fraction is undefined.
            print('GC content for position {} is undefined'.format(pos))
            continue
        gc_frac = gc / total
        print('GC content for position {} is {}%'.format(pos, round(100*gc_frac, 2)))


if __name__ == '__main__':
    main()

13

You might also like