Question 1¶

# A single record whose four words are separated by spaces.
line1 = "QR57613 1.3 Serpentes Pythonidae"

# sep=None is the default: split on runs of whitespace and return the words.
parts = line1.split(sep=None)

# The word count is simply the length of the resulting list.
num_words = len(parts)
print(num_words)

Question 2¶

# The same kind of record, but tab-delimited; the last field contains a space.
line2 = "QR57613\t1.3\tSerpentes\tPythonidae\tPython regius"

# Splitting explicitly on the tab keeps 'Python regius' together as one field.
fields = line2.split(sep="\t")

num_fields = len(fields)
print(num_fields)

# The separator passed to split() may be longer than one character.
line3 = 'ABC^^DEF^^GHI'
separator = '^^'
print(line3.split(separator))

['ABC', 'DEF', 'GHI']

Question 3¶

# Sample FINDHOM output used by the questions below: two preamble lines,
# a tab-separated column header, then one tab-separated row per result.
# The lines are joined with '\n', so there is no trailing newline.
findhom_result = "\n".join([
    "#FINDHOM v 1.2:",
    "Search results:",
    "Query\tMatch fraction\tScore\tSubject",
    "SMPL001\t0.7\t12331\tAQ10213 Phlebotomus perniciosus",
    "SMPL003\t0.5\t6032\tBZ102363 Phlebotomus papatasi",
    "SMPL004\t0.8\t13123\tRD178237 Sergentomyia dubia",
    "SMPL007\t0.6\t10610\tBQ187981 Phlebotomus papatasi",
])

# Evaluating the bare name makes the notebook show the string's repr,
# in which tabs and newlines appear as the escapes \t and \n.
findhom_result

'#FINDHOM v 1.2:\nSearch results:\nQuery\tMatch fraction\tScore\tSubject\nSMPL001\t0.7\t12331\tAQ10213 Phlebotomus perniciosus\nSMPL003\t0.5\t6032\tBZ102363 Phlebotomus papatasi\nSMPL004\t0.8\t13123\tRD178237 Sergentomyia dubia\nSMPL007\t0.6\t10610\tBQ187981 Phlebotomus papatasi'

# Break the report into one string per line by splitting on the newline.
lines = findhom_result.split(sep='\n')

# Echo each line with a prefix so the line boundaries are easy to see.
for line in lines:
    print("LINE:", line)

LINE: #FINDHOM v 1.2:
LINE: Search results:
LINE: Query	Match fraction	Score	Subject
LINE: SMPL001	0.7	12331	AQ10213 Phlebotomus perniciosus
LINE: SMPL003	0.5	6032	BZ102363 Phlebotomus papatasi
LINE: SMPL004	0.8	13123	RD178237 Sergentomyia dubia
LINE: SMPL007	0.6	10610	BQ187981 Phlebotomus papatasi

# Index -1 selects the last element of the list; as a bare expression the
# notebook shows its repr, so the tabs appear as \t escapes.
lines[-1]

'SMPL007\t0.6\t10610\tBQ187981 Phlebotomus papatasi'

# print() writes the string itself, so the tabs are rendered as real tab
# characters rather than as \t escapes.
print(lines[-1])

SMPL007	0.6	10610	BQ187981 Phlebotomus papatasi

Question 4¶

string1 = 'Mountain Goat'
string2 = 'Field Goat'
prefix = 'Mountain'

# .startswith() tests whether a string begins with the given prefix
# (there is also .endswith() for testing the end of a string).
for text in (string1, string2):
    print(text.startswith(prefix), prefix)

# The 'in' operator tests whether the word occurs anywhere in string1.
got_mountain = prefix in string1
print(got_mountain)

True Mountain
False Mountain
True

total = 0
# convert findhom_result into lines
lines = findhom_result.__A____
# search through all of the lines
___B____
    # check if the line starts with 'SMPL'
    if __C___
        total += 1
print("total count of 'SMPL'", total)

  Cell In[10], line 7
    if __C___
    ^
IndentationError: unexpected indent

# Turn the findhom_result text into a list of lines.
lines = findhom_result.split('\n')

# Show every line so the count below can be checked by eye.
for line in lines:
    print("LINE:", line)

# Tally the lines that begin with 'SMPL'.
total = sum(1 for row in lines if row.startswith('SMPL'))
print("total count of 'SMPL'", total)

LINE: #FINDHOM v 1.2:
LINE: Search results:
LINE: Query	Match fraction	Score	Subject
LINE: SMPL001	0.7	12331	AQ10213 Phlebotomus perniciosus
LINE: SMPL003	0.5	6032	BZ102363 Phlebotomus papatasi
LINE: SMPL004	0.8	13123	RD178237 Sergentomyia dubia
LINE: SMPL007	0.6	10610	BQ187981 Phlebotomus papatasi
total count of 'SMPL' 4

Question 5¶

lines = findhom_result.split('\n')

# Keep only the lines that contain at least one tab character.
tabbed_lines = [row for row in lines if '\t' in row]

# The column header line also contains tabs, so subtract one for it.
result_count = len(tabbed_lines) - 1
print(result_count)

# version 2
lines = findhom_result.split('\n')

# Keep only the lines that have exactly 4 tab-separated fields.
four_field_lines = [row for row in lines if len(row.split('\t')) == 4]

# The column header line also has 4 fields, so subtract one for it.
result_count = len(four_field_lines) - 1
print(result_count)

# Solve it with a two-state "state machine": before the header line has been
# seen nothing is counted; once it has been seen every further line is counted.
lines = findhom_result.split('\n')
header_found = False
count = 0
for line in lines:
    if line.startswith("Query"):
        # The header line flips the state but is not itself a result.
        header_found = True
    elif header_found:
        count += 1

print(count)

# Same two-state "state machine", but the state is tested BEFORE it is
# updated: the header line is examined while header_found is still False,
# so it is not counted and no `continue` is needed.
lines = findhom_result.split('\n')
header_found = False
count = 0
for line in lines:
    # Count this line only if the header was seen on an earlier line.
    count += 1 if header_found else 0
    # Once set, the flag stays set for the rest of the loop.
    header_found = header_found or line.startswith("Query")

print(count)