Data wrangling: strings to numbers#

Many datasets have numerical values encoded in strings, which need to be converted into numbers for analysis.

# The amount is stored as a string; the 'k' suffix is treated as "thousands" below.
data = '40k'
# Splitting on 'k' leaves the digits as the first piece and an empty string after it.
data.split('k')
['40', '']
# Convert the first piece to an int and scale it by 1000.
int(data.split('k')[0])*1000
40000
# Several amounts in the same format, used in the table examples that follow.
datalist = ['40k', '31k', '12k']

For use with Table columns or other array data#

from datascience import *
import numpy as np
from datascience import *
# Build a table with an 'index' column and the raw 'amount' strings from datalist.
t = Table().with_columns('index',[0,1,2],'amount',datalist)
t
index amount
0 40k
1 31k
2 12k
# Add a numeric 'value' column: take the digits before 'k' in each amount and multiply by 1000.
t = t.with_columns('value',[int(data.split('k')[0])*1000 for data in datalist])
t
index amount value
0 40k 40000
1 31k 31000
2 12k 12000

Float data embedded in strings within a table#

# Amounts with a decimal point need float() rather than int() for the conversion.
datalist = ['4.01k', '3.11k', '1.25k']
[float(data.split('k')[0])*1000 for data in datalist]
[4010.0, 3110.0, 1250.0]
# A salary column that mixes two string formats: a 'k' suffix ('75k') and
# comma-grouped digits ('102,500').
salary = Table().with_columns('position',['Data scientist','Chemist','Chemist','Biologist','Physicist','Finance'],
                              'salary',['75k','102,500','99k','103k','99,000','34,000'])
salary
position salary
Data scientist 75k
Chemist 102,500
Chemist 99k
Biologist 103k
Physicist 99,000
Finance 34,000
# Remove the thousands-separator commas, then convert to int.
# Only entries written with a 'k' suffix are in thousands; comma-formatted
# entries such as '102,500' are already full amounts and must not be scaled.
[int(data.replace(',','').split('k')[0]) * (1000 if 'k' in data else 1) for data in salary.column('salary')]
[75000, 102500, 99000, 103000, 99000, 34000]

Working with time strings#

# The hour is the first ':'-separated field of each time string; int() turns '01' into 1.
time_values = ['12:03:56', '01:04:23', '03:35:00']
t = t.with_columns('time',time_values,'hour',[int(data.split(':')[0]) for data in time_values])
t
index amount value time hour
0 40k 40000 12:03:56 12
1 31k 31000 01:04:23 1
2 12k 12000 03:35:00 3