{
"cells": [
{
"cell_type": "markdown",
"id": "d9fcf68b-1d29-4349-83c5-a7d5a2f67730",
"metadata": {},
"source": [
"# Data wrangling: strings to numbers\n",
"Many datasets have numerical values encoded in strings which need to be converted intonumbers for analysis"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d94c7198-be94-4367-ac5b-5c1c0df053e4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"data = '40k'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6a7d966f-9e7c-4055-9da5-b586f43f7079",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['40', '']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.split('k')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9f2829ec-3d7b-4d94-a381-0ac1ba39357c",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"40000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"int(data.split('k')[0])*1000"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c69f6346-a94a-4f0c-8625-db3ad870e7a4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"datalist = ['40k', '31k', '12k']"
]
},
{
"cell_type": "markdown",
"id": "e3cad0ec-807e-4ff3-9afb-51e8c99623fb",
"metadata": {},
"source": [
"### For use with Table columns or other array data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1a510924-621f-4d61-9842-24700672ffc1",
"metadata": {},
"outputs": [],
"source": [
"from datascience import *\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "501546d3-250b-4e1e-943e-5c6db20c418a",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" index | amount | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | 40k | \n",
"
\n",
" \n",
" 1 | 31k | \n",
"
\n",
" \n",
" 2 | 12k | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"index | amount\n",
"0 | 40k\n",
"1 | 31k\n",
"2 | 12k"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datascience import *\n",
"t = Table().with_columns('index',[0,1,2],'amount',datalist)\n",
"t"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e06815b9-60c4-4b3d-be6f-23ec9ea33764",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" index | amount | value | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | 40k | 40000 | \n",
"
\n",
" \n",
" 1 | 31k | 31000 | \n",
"
\n",
" \n",
" 2 | 12k | 12000 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"index | amount | value\n",
"0 | 40k | 40000\n",
"1 | 31k | 31000\n",
"2 | 12k | 12000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = t.with_columns('value',[int(data.split('k')[0])*1000 for data in datalist])\n",
"t"
]
},
{
"cell_type": "markdown",
"id": "f207726b-05ca-4621-91b7-6b57200abf83",
"metadata": {},
"source": [
"#### Float data embeded in string within table"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ffaf6389-ddb0-4135-9655-aa2397f56de9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"datalist = ['4.01k', '3.11k', '1.25k']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "28422959-1078-42f2-b265-358f749f1b30",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[4010.0, 3110.0, 1250.0]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[float(data.split('k')[0])*1000 for data in datalist]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ebf73b84-ff5e-46be-afd5-3780fbb97b5a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" position | salary | \n",
"
\n",
" \n",
" \n",
" \n",
" Data scientist | 75k | \n",
"
\n",
" \n",
" Chemist | 102,500 | \n",
"
\n",
" \n",
" Chemist | 99k | \n",
"
\n",
" \n",
" Biologist | 103k | \n",
"
\n",
" \n",
" Physicist | 99,000 | \n",
"
\n",
" \n",
" Finance | 34,000 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"position | salary\n",
"Data scientist | 75k\n",
"Chemist | 102,500\n",
"Chemist | 99k\n",
"Biologist | 103k\n",
"Physicist | 99,000\n",
"Finance | 34,000"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"salary = Table().with_columns('position',['Data scientist','Chemist','Chemist','Biologist','Physicist','Finance'],\n",
" 'salary',['75k','102,500','99k','103k','99,000','34,000'])\n",
"salary"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a692d772-fab9-49f7-b3af-e06e9f7c1396",
"metadata": {
"jupyter": {
"source_hidden": true
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[75000, 102500000, 99000, 103000, 99000000, 34000000]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[int(data.replace(',','').split('k')[0])*1000 for data in salary.column('salary')]"
]
},
{
"cell_type": "markdown",
"id": "1aa5103f-c3d1-4539-9771-5c35e9c9e5c6",
"metadata": {},
"source": [
"### Working with time strings"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "0dd43637-8566-4f1c-8ce9-9fe98e4fe63c",
"metadata": {},
"outputs": [],
"source": [
"time_values = ['12:03:56', '01:04:23', '03:35:00']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7cb0d25c-654e-4034-b3f8-6ade45d59f90",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" index | amount | value | time | hour | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | 40k | 40000 | 12:03:56 | 12 | \n",
"
\n",
" \n",
" 1 | 31k | 31000 | 01:04:23 | 1 | \n",
"
\n",
" \n",
" 2 | 12k | 12000 | 03:35:00 | 3 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"index | amount | value | time | hour\n",
"0 | 40k | 40000 | 12:03:56 | 12\n",
"1 | 31k | 31000 | 01:04:23 | 1\n",
"2 | 12k | 12000 | 03:35:00 | 3"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = t.with_columns('time',time_values,'hour',[int(data.split(':')[0]) for data in time_values])\n",
"t"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}