Lesson 8: String methods notebook - Introduction to Python For Bioinformatics

my_str = 'The Dude abides.'

print("1", my_str[5])
print("2", my_str[:6])
print("3", my_str[::2])
print("4", my_str[::-1])
print("5", my_str[::-2])

1 u
2 The Du
3 TeDd bds
4 .sediba eduD ehT
5 .eiaeu h

def bit_of_string(my_str, start=0,
                  end=None, step=1):
    """Return the slice my_str[start:end:step] and print which slice was taken.

    Parameters:
        my_str: the string (or other sliceable sequence) to slice.
        start: index where the slice begins (default 0).
        end: index where the slice stops, exclusive. When omitted, the length
            of the string passed in is used.
        step: stride of the slice (default 1).

    Returns:
        The requested slice of my_str.
    """
    # Default arguments are evaluated once, when the function is defined, so
    # a default of len(my_str) would freeze the length of whatever global
    # my_str existed at that moment. Resolve the default per call instead.
    if end is None:
        end = len(my_str)
    print(f"returning my_str[{start}:{end}:{step}]")
    return my_str[start:end:step]
print("a", bit_of_string(my_str, step=2))

returning my_str[0:16:2]
a TeDd bds

String Methods

# Define sequence
seq = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'

# Count G's and C's
seq.count('G') + seq.count('C')

16

my_name = 'Peter'
print(my_name.count('e'))
print('count the e characters in a literal',
      'Peter'.count('e'))

2
count the e characters in a literal 2

# methods are available based on the type of the value
# you're working with
my_num = 102
print(my_num.count(0))

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[5], line 4
      1 # methods are available based on the type of the value
      2 # you're working with
      3 my_num = 102
----> 4 print(my_num.count(0))

AttributeError: 'int' object has no attribute 'count'

# explaining *parameters aka *args
def my_print(prefix, *parameters):
    """Print prefix, a colon, then every extra argument followed by a space.

    Any number of positional string arguments after prefix are collected
    into the tuple `parameters`.
    """
    # Each parameter carries its own trailing space, so the printed line
    # ends with a space whenever at least one parameter is given.
    tail = ''.join(word + ' ' for word in parameters)
    print(prefix + ':' + tail)

my_print('Hello', 'world', 'today')

Hello:world today

def gc_content(dna_string):
    """Return the fraction of bases in dna_string that are G or C.

    Bases are counted case-insensitively, so 'gc' and 'GC' give the same
    result. An empty sequence has no bases, so its GC content is reported
    as 0.0 rather than raising ZeroDivisionError.

    Parameters:
        dna_string: nucleotide sequence as a string.

    Returns:
        A float between 0.0 and 1.0.
    """
    length = len(dna_string)
    if length == 0:
        return 0.0
    sequence = dna_string.upper()
    gc_count = sequence.count('G') + sequence.count('C')
    return gc_count / length

my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
print('gc content of my_dna', gc_content(my_dna))

gc content of my_dna 0.5333333333333333

# more with .count()
my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
print('count GA', my_dna.count('GA'))

# count with a substring again: .count() only counts non-overlapping matches
print('count AA in AAAAAA', 'AAAAAA'.count('AA'))

count GA 2
count AA in AAAAAA 3

# find
my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
my_dna.find('AUG')

10

print('look for nonsense', my_dna.find('nonsense'))

look for nonsense -1

my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
pattern = 'AUG'
if my_dna.find(pattern) == -1:
    print(pattern, 'not found')
else:
    print(pattern, 'found at', my_dna.find(pattern))

AUG found at 10

my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
pattern = 'AUG'
if pattern not in my_dna:
    print(pattern, 'not found')
else:
    print(pattern, 'found at', my_dna.find(pattern))

AUG found at 10

my_dna.find?

# find
my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
# print every index next to its base so the positions reported below can be checked by eye
for i, letter in enumerate(my_dna):
    print(i, letter)
found_at = my_dna.find('AUG')
print('AUG found at', found_at)
# look for the next 'AUG'
pattern = 'AUG'
pattern_length = len(pattern)
# resume searching just past the first match
start_search_at = found_at + pattern_length
# Pass the start index to .find() instead of slicing: the result is then
# already an index into my_dna, and it stays -1 when there is no further
# match (adding an offset to the -1 from a sliced search would hide that).
found_next = my_dna.find(pattern, start_search_at)
print('AUG found next at', found_next)

0 G
1 A
2 C
3 A
4 G
5 A
6 C
7 U
8 C
9 C
10 A
11 U
12 G
13 C
14 A
15 C
16 G
17 U
18 G
19 G
20 G
21 U
22 A
23 U
24 C
25 A
26 U
27 G
28 U
29 C
AUG found at 10
AUG found next at 25

# rfind
my_dna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
pattern = 'AUG'
print(my_dna.rfind(pattern))

# reverse complement again
def complement_base(base):
    """Returns the Watson-Crick complement of a base.

    The lookup is case-insensitive and the complement is always returned
    in upper case.

    Parameters:
        base: a single DNA base, one of A, T, G or C in either case.

    Returns:
        The complementary base as an upper-case one-character string.

    Raises:
        ValueError: if base is not one of the four DNA bases. A catch-all
            branch would silently report 'G' for inputs such as 'N' or 'U'.
    """
    complements = {'a': 'T', 't': 'A', 'g': 'C', 'c': 'G'}
    try:
        # Convert to lowercase so 'A' and 'a' are treated alike
        return complements[base.lower()]
    except KeyError:
        raise ValueError(f"not a DNA base: {base!r}") from None

# .lower() and .upper()
my_name = 'Peter'
print(my_name.lower())
print(my_name.upper())

peter
PETER

# .replace
my_rna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
my_dna = my_rna.replace('U', 'T')
print('my_rna', my_rna)
print('my_dna', my_dna)

my_rna GACAGACUCCAUGCACGUGGGUAUCAUGUC
my_dna GACAGACTCCATGCACGTGGGTATCATGTC

my_rna = 'GACAGACUCCAUGCACGUGGGUAUCAUGUC'
my_dna = my_rna.replace('U', 'T')

def complement_seq(my_seq):
    """Return the upper-case complement of a DNA sequence.

    The sequence is upper-cased first, then G<->C and T<->A are swapped.
    Any other character is passed through (upper-cased) unchanged.
    """
    # A translation table swaps all four bases in one pass, so no
    # intermediate placeholder letters are needed to keep swaps apart.
    swap_table = str.maketrans('GCTA', 'CGAT')
    return my_seq.upper().translate(swap_table)

print('my_dna              ', id(my_dna), my_dna)
my_dna = complement_seq(my_dna)
print('complement of my_dna', id(my_dna), my_dna)

my_dna               139504722109792 GACAGACTCCATGCACGTGGGTATCATGTC
complement of my_dna 139504722184432 CTGTCTGAGGTACGTGCACCCATAGTACAG

# .join()
'|'.join(['A','B','C'])

'A|B|C'

#.join again
'|'.join('ABC')

'A|B|C'

'one'.join('ABC')

'AoneBoneC'

# .split()
my_line = 'AC1354 1.2 654'
# split to list
data = my_line.split(' ')
print(data)

['AC1354', '1.2', '654']

data = my_line.split()
print(data)

['AC1354', '1.2', '654']

my_line2 = 'AC1354  1.2 654'
data = my_line2.split()
print(data)

['AC1354', '1.2', '654']

my_line2 = 'AC1354  1.2 654'
data = my_line2.split(' ')
print(data)

['AC1354', '', '1.2', '654']

# .join it
print(' '.join(data))

AC1354  1.2 654

Format and f-strings

my_str = """
Let's do a Mad Lib!
During this bootcamp, I feel {adjective}.
The instructors give us {plural_noun}.
""".format(adjective='truculent', plural_noun='haircuts')

print(my_str)


Let's do a Mad Lib!
During this bootcamp, I feel truculent.
The instructors give us haircuts.

distance = 113
town_name = 'Ceres'
my_str = 'The distance to {town} is {distance:04d} km'.format(town=town_name,
                                                              distance=distance)
print(my_str)

The distance to Ceres is 0113 km

first_name = 'Peter'
surname = 'van Heusden'
my_str = 'first name: {} surname: {}'.format(first_name, surname)
print(my_str)

first name: Peter surname: van Heusden

# f-string
first_name = 'Peter'
surname = 'van Heusden'
my_str = f'first name: {firstname} surname: {surname}'
print(my_str)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 4
      2 first_name = 'Peter'
      3 surname = 'van Heusden'
----> 4 my_str = f'first name: {firstname} surname: {surname}'
      5 print(my_str)

NameError: name 'firstname' is not defined

# f-string
first_name = 'Peter'
surname = 'van Heusden'
my_str = f'first name: {first_name} surname: {surname}'
print(my_str)

first name: Peter surname: van Heusden

# old school way - don't use, but you might find it in other code
my_str = 'first name: %s' % first_name
print(my_str)

first name: Peter