solutionsExerciseMaster11 23

Suggested solutions for exercises 11-23
11.py
Code
#!/usr/bin/python3
# Exercise: Count the sequences in a fasta or fastq file
# Usage: 11.py inputfile
import sys


def count_records(lines, is_fastq):
    """Return the number of sequence records found in *lines*.

    lines: iterable of text lines (an open file works).
    is_fastq: True to count four-line fastq records, False to count
        fasta header lines (lines starting with '>').
    """
    if is_fastq:
        # Every fastq record occupies exactly four lines
        return sum(1 for _ in lines) // 4
    return sum(1 for line in lines if line.startswith('>'))


def main():
    """Print the number of sequences in the file named on the command line."""
    filename = sys.argv[1]
    # You can test if a substring is found in a string in a similar way
    # as you do a membership test for a list
    is_fastq = '.fastq' in filename or '.fq' in filename
    with open(filename, 'r') as in_fh:
        print(count_records(in_fh, is_fastq))


if __name__ == '__main__':
    main()
1
12.py
Code
#!/usr/bin/python3
# Exercise: Transcribe DNA fasta to RNA

# Author: Martin Basterrechea
import sys


def transcribe_line(line):
    """Return *line* without trailing whitespace, transcribed to RNA.

    Header lines (starting with '>') are returned untouched apart from
    the stripped trailing whitespace. In every other line 't' becomes 'u'
    and 'T' becomes 'U'. Empty lines are returned as empty strings.
    """
    line = line.rstrip()
    # startswith() is safe on an empty string, indexing with [0] is not
    if line.startswith('>'):
        return line
    return line.replace('t', 'u').replace('T', 'U')


def main():
    """Transcribe the fasta file given as first argument into the second."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 12.py inputfile outputfile')
        sys.exit(1)  # Non-zero exit status signals the failure to the shell
    inputname = sys.argv[1]
    outputname = sys.argv[2]
    with open(inputname, 'r') as inf, open(outputname, 'w') as outf:
        for line in inf:
            print(transcribe_line(line), file=outf)


if __name__ == '__main__':
    main()
2
13.py
Code
#!/usr/bin/python3
# Exercise: Calculate GC content in multi-fasta file
import sys


def gc_content(lines):
    """Return the GC fraction of all sequence lines, rounded to 2 decimals.

    lines: iterable of fasta lines. Lines starting with '>' are skipped.
    Only a, c, g and t (either case) are counted; anything else is ignored.
    Raises ValueError if no a/c/g/t characters are found at all.
    """
    gc = 0
    at = 0
    for line in lines:
        if line.startswith('>'):
            continue
        line = line.rstrip().lower()
        # Don't use regular expressions for this, it's slower
        gc += line.count('c') + line.count('g')
        at += line.count('a') + line.count('t')
    if gc + at == 0:
        raise ValueError('no nucleotides found in input')
    return round(gc / (gc + at), 2)


def main():
    """Print the GC content of the fasta file named on the command line."""
    if len(sys.argv) < 2:  # First argument is second in the sys.argv list!
        print('Error: Not enough arguments. Usage: 13.py inputfile')
        sys.exit(1)
    inputname = sys.argv[1]
    with open(inputname, 'r') as inf:
        try:
            result = gc_content(inf)
        except ValueError as err:
            print('Error: {}'.format(err))
            sys.exit(1)
    print(result)


if __name__ == '__main__':
    main()
3
14.py
Code
#!/usr/bin/python3
# Exercise: Calculate average length of sequences in multi-fasta file
import sys


def average_length(lines):
    """Return the mean sequence length in *lines*, rounded to 1 decimal.

    We get the sum of the sequence lengths and divide it by the number of
    sequences. Header lines start with '>'; every other line contributes
    its length without trailing whitespace, so the newline is not counted.
    Raises ValueError if there are no header lines.
    """
    total_length = 0
    seq_count = 0
    for line in lines:
        if line.startswith('>'):
            seq_count += 1
        else:
            total_length += len(line.rstrip())
    if seq_count == 0:
        raise ValueError('no sequences found in input')
    return round(total_length / seq_count, 1)


def main():
    """Print the average sequence length of the fasta file given as argument."""
    if len(sys.argv) < 2:  # First argument is second in the sys.argv list!
        print('Error: Not enough arguments. Usage: 14.py inputfile')
        sys.exit(1)
    inputname = sys.argv[1]
    with open(inputname, 'r') as inf:
        try:
            result = average_length(inf)
        except ValueError as err:
            print('Error: {}'.format(err))
            sys.exit(1)
    print(result)


if __name__ == '__main__':
    main()
4
15.py
Code
#!/usr/bin/python3
# Exercise: Calculate GC content in fastq file
import sys


def fastq_gc_content(lines):
    """Return the GC fraction of the reads in *lines*, rounded to 2 decimals.

    lines: iterable of fastq lines in the four-line record layout
    (header, sequence, '+' separator, qualities). Only the second line of
    each record is inspected, and only a, c, g and t (either case) count.
    Raises ValueError if no a/c/g/t characters are found at all.
    """
    gc = 0
    at = 0
    for line_number, line in enumerate(lines, start=1):
        # The sequence is line 2 of every group of 4; a plain odd/even test
        # would pick the header and '+' lines instead
        if line_number % 4 != 2:
            continue
        line = line.rstrip().lower()
        gc += line.count('c') + line.count('g')
        at += line.count('a') + line.count('t')
    if gc + at == 0:
        raise ValueError('no nucleotides found in input')
    return round(gc / (gc + at), 2)


def main():
    """Print the GC content of the fastq file named on the command line."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 15.py fastqFile')
        sys.exit(1)
    inputname = sys.argv[1]
    with open(inputname, 'r') as inf:
        try:
            result = fastq_gc_content(inf)
        except ValueError as err:
            print('Error: {}'.format(err))
            sys.exit(1)
    print(result)


if __name__ == '__main__':
    main()
5
16.py
Code
#!/usr/bin/python3
# Usage: 16.py fasta_file
# Compares the nucleotides counted per sequence with the 'length=<n>'
# label found in each fasta header.
import sys
import re

# Compiling the pattern is not necessary, but faster than using the string directly.
# A raw string keeps \d from being read as a (invalid) string escape.
LENGTH_PATTERN = re.compile(r'length=(\d+)')


def summarize_fasta(lines):
    """Return (id_count, nucleotide_count, label_length_total) for *lines*.

    id_count: number of header lines (starting with '>').
    nucleotide_count: characters on all other lines, trailing whitespace
        excluded.
    label_length_total: sum of the numbers in the headers' 'length=<n>'
        labels.
    Raises ValueError for a header without a 'length=<n>' label.
    """
    count_id = 0
    count_nt = 0
    header_length = 0
    for line in lines:
        line = line.rstrip()
        # Note that there is an empty line of the input file
        # Once stripped this is an empty string and evaluates to False
        if not line:
            continue
        if line.startswith('>'):
            count_id += 1
            match_object = LENGTH_PATTERN.search(line)
            if match_object is None:
                raise ValueError('No length= label in header: {}'.format(line))
            header_length += int(match_object.group(1))
        else:
            count_nt += len(line)
    return count_id, count_nt, header_length


def main():
    """Print ID count and total/average lengths for the fasta file argument."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 16.py inputfile')
        sys.exit(1)
    with open(sys.argv[1], 'r') as in_fh:
        count_id, count_nt, header_length = summarize_fasta(in_fh)
    if count_id == 0:
        # Without sequences the averages below would divide by zero
        print('Error: no sequences found in input')
        sys.exit(1)
    print('IDs: {}'.format(count_id))
    print('Total counted nucl: {} nt, Average: {} nt'.format(count_nt, round(count_nt / count_id, 1)))
    print('Total label length: {} nt, Average: {} nt'.format(header_length, round(header_length / count_id, 1)))


if __name__ == '__main__':
    main()
6
17.py
Code
#!/usr/bin/python3
# Usage: 17.py target_id fasta_file
import sys
import re  # Only needed for the regular expression alternative described below

# A regular expression solution is also possible:
#
#     search_pattern = re.compile('>' + re.escape(target_id) + r'\s')
#     if search_pattern.match(line): ...
#
# The added \s to the pattern match a trailing space, tab or newline,
# preventing a match with only part of the ID.
# The string solution used here is much faster for bigger files.


def find_id_line(lines, target_id):
    """Return the header line whose ID equals *target_id*, or None.

    lines: iterable of fasta lines; reading stops at the first match.
    target_id: the ID to look for, without the leading '>'.
    The returned line has its trailing whitespace removed.
    """
    wanted = '>' + target_id
    for line in lines:
        # We split the line on () which means on any white space.
        # We look at the first element. If there was no whitespace
        # the entire line will be in the first element
        fields = line.split()
        # Compare for equality: a substring test would also accept IDs
        # that merely contain the target (seq1 inside seq10)
        if fields and fields[0] == wanted:
            return line.rstrip()
    return None


def main():
    """Print the header line of the requested ID, or a not-found message."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 17.py target_id fasta_file')
        sys.exit(1)
    target_id = sys.argv[1]
    with open(sys.argv[2], 'r') as in_fh:
        found_id_line = find_id_line(in_fh, target_id)
    if found_id_line is None:
        print('The ID {} was not found'.format(target_id))
    else:
        print(found_id_line)


if __name__ == '__main__':
    main()
7
18.py
Code
#!/usr/bin/python3
#Usage: ./18.py input_fasta
# Prints the fasta file with every sequence joined onto a single line.
import sys


def single_line_fasta(lines):
    """Yield the lines of a fasta file with each sequence on one line.

    lines: iterable of fasta lines, sequences possibly wrapped over
    several lines. Header lines (starting with '>') are yielded as they
    are; the sequence lines that follow are concatenated and yielded as
    one string. Trailing whitespace is removed and blank lines dropped.
    """
    sequence_parts = []
    for line in lines:
        line = line.rstrip()  # Without this the pieces keep their newline
        if line.startswith('>'):
            if sequence_parts:
                yield ''.join(sequence_parts)
                sequence_parts = []
            yield line
        elif line:
            sequence_parts.append(line)
    if sequence_parts:
        # The last sequence has no header after it to flush it
        yield ''.join(sequence_parts)


def main():
    """Print the single-line version of the fasta file given as argument."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 18.py input_fasta')
        sys.exit(1)
    with open(sys.argv[1], 'r') as f:
        for out_line in single_line_fasta(f):
            print(out_line)


if __name__ == '__main__':
    main()
8
19.py
Code
#!/usr/bin/python3
# Exercise: Sum the per-residue masses of an amino acid sequence
import sys

mass_dict = {
    'P': 97.1167, 'D': 115.0886, 'T': 101.1051, 'V': 99.1326,
    'Y': 163.1760, 'M': 131.1926, 'G': 57.0519, 'H': 137.1411,
    'C': 103.1388, 'E': 129.1155, 'S': 87.0782, 'F': 147.1766,
    'I': 113.1594, 'A': 71.0788, 'W': 186.2132, 'N': 114.1038,
    'Q': 128.1307, 'L': 113.1594, 'R': 156.1875, 'K': 128.1741,
}


def protein_mass(prot):
    """Return the sum of the mass_dict values for every letter in *prot*.

    prot: amino acid sequence in one-letter code, upper or lower case.
    Raises ValueError naming the first letter that is not in mass_dict.
    """
    total_mass = 0
    for aa in prot:
        # This will return the value if it exists, or None if it doesn't
        mass = mass_dict.get(aa.upper())
        if mass is None:
            raise ValueError('Invalid amino acid!: {}'.format(aa))
        total_mass += mass
    return total_mass


def main():
    """Ask for a sequence and print its total mass."""
    prot = input('Enter an amino acid sequence: ')
    try:
        total_mass = protein_mass(prot)
    except ValueError as err:
        print(err)
        sys.exit(1)  # Non-zero exit status signals the failure to the shell
    print('Total mass: ' + str(total_mass))


if __name__ == '__main__':
    main()
9
20.py
Code
#!/usr/bin/python3
#Usage: ./20_getSeq.py oligo fasta_file

#Expects single line fasta format
import sys
import re

COMPLEMENT_TABLE = str.maketrans('acgtACGT', 'tgcaTGCA')


def reverse_complement(oligo):
    """Return *oligo* reversed, with a/c/g/t (either case) complemented."""
    return oligo[::-1].translate(COMPLEMENT_TABLE)


def oligo_abundance(lines, oligo):
    """Yield (id, count) for every sequence line containing *oligo*.

    lines: iterable of single-line fasta lines.
    oligo: the sequence to look for; it is matched literally and
        case-insensitively, on its own and as reverse complement.
    id is the first whitespace-separated field of the preceding header,
    including the leading '>'. count is the number of non-overlapping
    matches of the oligo plus those of its reverse complement. Sequence
    lines without a match, and lines before the first header, yield
    nothing.
    """
    # re.escape() keeps characters in the oligo from acting as regex syntax;
    # compiling once avoids repeating the pattern lookup for every line
    forward = re.compile(re.escape(oligo), re.I)
    reverse = re.compile(re.escape(reverse_complement(oligo)), re.I)
    id_line = None
    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            id_line = line.split()[0]  # Save for future use
        elif id_line is not None:
            number_of_oligos = len(forward.findall(line)) + len(reverse.findall(line))
            if number_of_oligos > 0:
                yield id_line, number_of_oligos


def main():
    """Print a tab-separated abundance table for the oligo in the fasta file."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 20_getSeq.py oligo fasta_file')
        sys.exit(1)
    target_oligo = sys.argv[1]
    print('#Oligo\t{}'.format(target_oligo))
    print('#id\tabundance')
    with open(sys.argv[2], 'r') as in_fh:
        for id_line, number_of_oligos in oligo_abundance(in_fh, target_oligo):
            print('{}\t{}'.format(id_line, number_of_oligos))


if __name__ == '__main__':
    main()
10
21.py
Code
#!/usr/bin/env python3
#Exercise: Translate DNA into AA
#Note that this script only works with non interleaved sequences
import sys

gen_code = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate_dna(sequence):
    """Return the amino acid string for the DNA in *sequence*.

    sequence: one DNA sequence, upper or lower case; surrounding
        whitespace (such as the line's newline) is ignored.
    Translation starts at the first base and proceeds codon by codon.
    Codons missing from gen_code (for instance containing N) become 'X'.
    Bases left over when the length is not a multiple of 3 are dropped.
    """
    sequence = sequence.strip().upper()
    # Stopping at len - 2 leaves out the incomplete codon at the end
    codon_starts = range(0, len(sequence) - 2, 3)
    return ''.join(gen_code.get(sequence[n:n + 3], 'X') for n in codon_starts)


def main():
    """Translate every sequence of the input fasta into the output file."""
    if len(sys.argv) < 3:
        print('Error, not enough arguments. Usage: 21.py input.fasta output.faa')
        sys.exit(1)
    with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:
        for line in inf:
            line = line.rstrip()  # print() adds the newline back
            if not line:
                continue
            if line.startswith('>'):
                print(line, file=outf)
            else:
                print(translate_dna(line), file=outf)


if __name__ == '__main__':
    main()
11
22.py
Code
#!/usr/bin/python3
# Runs as 22.py rRNAlistFile inFastaFile outFastaFile
import sys


def read_id_set(lines):
    """Return the set of non-empty, whitespace-stripped lines in *lines*.

    Stripping matters: with the newline left on, no ID taken from a fasta
    header could ever be equal to an entry of the set.
    """
    stripped = (line.strip() for line in lines)
    return {seq_id for seq_id in stripped if seq_id}


def filter_fasta(lines, excluded_ids):
    """Yield the fasta lines of every record whose ID is not excluded.

    lines: iterable of fasta lines.
    excluded_ids: container of IDs (without '>') to leave out.
    A record's ID is the first whitespace-separated field of its header
    with the leading '>' removed. Lines are yielded without trailing
    whitespace. Lines before the first header are passed through.
    """
    keep = True
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            seqid = line.split()[0]
            seqid = seqid[1:]  # Remove the >
            keep = seqid not in excluded_ids
        if keep:
            yield line


def main():
    """Copy the fasta file to the output, leaving out the listed rRNA IDs."""
    if len(sys.argv) < 4:
        print('Error: Not enough arguments. Usage: 22.py rRNAlistFile inFastaFile outFastaFile')
        sys.exit(1)
    # We create a set of the ids found in the rRNA list
    with open(sys.argv[1], 'r') as inrRNA:
        rRNAset = read_id_set(inrRNA)
    # Parse the fasta file
    with open(sys.argv[2], 'r') as inFasta, open(sys.argv[3], 'w') as outFasta:
        for line in filter_fasta(inFasta, rRNAset):
            print(line, file=outFasta)


if __name__ == '__main__':
    main()
12
23.py
Code
#!/usr/bin/python3
# Usage: 23.py gene_fasta_file
# Prints the GC content per codon position (1, 2 and 3).
import sys

ALLOWED_LETTERS = 'acgt'
GC_LETTERS = 'cg'


def report_progress(lines, every=10000):
    """Yield *lines* unchanged, reporting on stderr every *every* lines."""
    for count_lines, line in enumerate(lines, start=1):
        if count_lines % every == 0:
            # \r keeps the counter on a single, repeatedly overwritten line
            print('{} lines processed'.format(count_lines), file=sys.stderr, end='\r')
        yield line


def count_codon_positions(lines):
    """Return (gc_counts, total_counts), each a list of three integers.

    lines: iterable of fasta lines. Every non-header line is read as
    starting at codon position 1, so index n in the line belongs to
    position n % 3.
    total_counts[p] is the number of a/c/g/t letters (either case) seen at
    codon position p; gc_counts[p] is how many of those were c or g.
    Header lines (starting with '>') and blank lines are skipped.
    """
    codon_total_count = [0, 0, 0]
    codon_gc_count = [0, 0, 0]
    for line in lines:
        line = line.lower().rstrip()
        if not line or line.startswith('>'):
            continue
        for n, letter in enumerate(line):
            # Get the current codon position, here 0, 1 or 2
            current_pos = n % 3
            if letter in ALLOWED_LETTERS:
                codon_total_count[current_pos] += 1
                if letter in GC_LETTERS:
                    codon_gc_count[current_pos] += 1
    return codon_gc_count, codon_total_count


def main():
    """Print the GC percentage of each codon position for the fasta argument."""
    if len(sys.argv) != 2:
        print('Usage: {} gene_fasta_file'.format(sys.argv[0]))
        sys.exit(1)
    with open(sys.argv[1], 'r') as in_fh:
        codon_gc_count, codon_total_count = count_codon_positions(report_progress(in_fh))
    for pos in range(3):
        if codon_total_count[pos] == 0:
            # Nothing counted at this position: the fraction is undefined
            print('GC content for position {} is undefined (no bases counted)'.format(pos + 1))
            continue
        gc_frac = codon_gc_count[pos] / codon_total_count[pos]
        print('GC content for position {} is {}%'.format(pos + 1, round(100 * gc_frac, 2)))


if __name__ == '__main__':
    main()
13

solutionsExerciseMaster11 23

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

solutionsExerciseMaster11 23

Uploaded by

Copyright:

Available Formats

Suggested solutions exercises 11-23

with open(sys.argv[1], 'r') as in_fh:

# You can test if a substring is found in a string in a similar way

for line in in_fh:

for line in in_fh:

# Exercise: Transcribe DNA fasta to RNA

with open(inputname, 'r') as inf, open(outputname, 'w') as outf:

with open(inputname, 'r') as inf:

if len(sys.argv) < 2: # First argument is second in the sys.argv list!

with open(inputname, 'r') as inf:

with open(inputname, 'r') as inf:

with open(sys.argv[1], 'r') as in_fh:

for line in in_fh:

# Note that there is an empty line of the input file

with open(sys.argv[2], 'r') as in_fh:

# A string solution, much faster for bigger files.

if found_id is False: # This if statement is really not needed

#Usage: ./18.py input_fasta

print() # Print a final newline

mass_dict = { 'P' : 97.1167, 'D' : 115.0886, 'T' : 101.1051, 'V' : 99.1326,

prot = input('Enter an amino acid sequence: ')

#Usage: ./20_getSeq.py oligo fasta_file

with open(sys.argv[2], 'r') as in_fh:

for line in in_fh:

gen_code = {'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L',

with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:

print('{}'.format(aa_seq), file = outf)

# Runs as 22.py rRNAlistFile inFastaFile outFastaFile

# We create a set of the ids found in the rRNA list

# Parse the fasta file

if not line[0] == '>':

# Get the current codon position, here 0, 1 or 2

for pos in range(3):

You might also like