# Data wrangling: strings to numbers
Many datasets have numerical values encoded in strings, which need to be converted into numbers for analysis.

In [1]:
# Sample amount written with a trailing "k" to denote thousands.
data = "40k"

In [2]:
# Splitting on the suffix leaves the digits first and an empty string after the "k".
data.split(sep='k')

['40', '']

In [3]:
# Keep the text before the "k", convert it to an integer, and scale to thousands.
1000 * int(data.split('k')[0])

40000

In [4]:
# Several "k"-suffixed amounts to convert together.
datalist = [
    '40k',
    '31k',
    '12k',
]

### For use with Table columns or other array data

In [3]:
from datascience import *
import numpy as np

In [6]:
from datascience import *

# Build a three-row table pairing an index with each raw amount string.
t = Table().with_columns(
    'index', [0, 1, 2],
    'amount', datalist,
)
t

index,amount
0,40k
1,31k
2,12k


In [7]:
# Derive the numeric column from the table's own 'amount' column rather than
# from the global `datalist`: that name is rebound to float strings in a later
# cell, so reading it here would make this cell fail if it were re-run afterwards.
t = t.with_columns(
    'value',
    [int(amount.split('k')[0]) * 1000 for amount in t.column('amount')],
)
t

index,amount,value
0,40k,40000
1,31k,31000
2,12k,12000


#### Float data embedded in strings within a table

In [2]:
# Amounts with a fractional part before the "k" suffix.
datalist = [
    '4.01k',
    '3.11k',
    '1.25k',
]

In [3]:
# Parse the text before the "k" as a float, then scale it to thousands.
[1000 * float(entry.split('k')[0]) for entry in datalist]

[4010.0, 3110.0, 1250.0]

In [4]:
# Salary strings in mixed formats: some use a "k" suffix, others use a
# comma as the thousands separator.
salary = Table().with_columns(
    'position',
    ['Data scientist', 'Chemist', 'Chemist', 'Biologist', 'Physicist', 'Finance'],
    'salary',
    ['75k', '102,500', '99k', '103k', '99,000', '34,000'],
)
salary

position,salary
Data scientist,75k
Chemist,102500
Chemist,99k
Biologist,103k
Physicist,99000
Finance,34000


In [6]:
def salary_to_int(text):
    """Convert a salary string such as '75k' or '102,500' to an integer.

    Commas used as thousands separators are removed. Only values that end
    in 'k' are multiplied by 1000; values without the suffix are already
    written out in full and are returned as-is.
    """
    digits = text.replace(',', '')
    if digits.endswith('k'):
        # Drop the suffix before converting, then scale to thousands.
        return int(digits[:-1]) * 1000
    return int(digits)

[salary_to_int(data) for data in salary.column('salary')]

[75000, 102500, 99000, 103000, 99000, 34000]

### Working with time strings

In [8]:
# Colon-separated time strings; the first field is used as the hour below.
time_values = [
    '12:03:56',
    '01:04:23',
    '03:35:00',
]

In [9]:
# Attach the raw time strings plus an integer 'hour' column taken from the
# text before the first ':' in each value.
t = t.with_columns(
    'time', time_values,
    'hour', [int(stamp.split(':')[0]) for stamp in time_values],
)
t

index,amount,value,time,hour
0,40k,40000,12:03:56,12
1,31k,31000,01:04:23,1
2,12k,12000,03:35:00,3
