{ "cells": [ { "cell_type": "markdown", "id": "d9fcf68b-1d29-4349-83c5-a7d5a2f67730", "metadata": {}, "source": [ "# Data wrangling: strings to numbers\n", "Many datasets have numerical values encoded in strings which need to be converted intonumbers for analysis" ] }, { "cell_type": "code", "execution_count": 1, "id": "d94c7198-be94-4367-ac5b-5c1c0df053e4", "metadata": { "tags": [] }, "outputs": [], "source": [ "data = '40k'" ] }, { "cell_type": "code", "execution_count": 2, "id": "6a7d966f-9e7c-4055-9da5-b586f43f7079", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['40', '']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.split('k')" ] }, { "cell_type": "code", "execution_count": 3, "id": "9f2829ec-3d7b-4d94-a381-0ac1ba39357c", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "40000" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "int(data.split('k')[0])*1000" ] }, { "cell_type": "code", "execution_count": 4, "id": "c69f6346-a94a-4f0c-8625-db3ad870e7a4", "metadata": { "tags": [] }, "outputs": [], "source": [ "datalist = ['40k', '31k', '12k']" ] }, { "cell_type": "markdown", "id": "e3cad0ec-807e-4ff3-9afb-51e8c99623fb", "metadata": {}, "source": [ "### For use with Table columns or other array data" ] }, { "cell_type": "code", "execution_count": 3, "id": "1a510924-621f-4d61-9842-24700672ffc1", "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 6, "id": "501546d3-250b-4e1e-943e-5c6db20c418a", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index amount
0 40k
1 31k
2 12k
" ], "text/plain": [ "index | amount\n", "0 | 40k\n", "1 | 31k\n", "2 | 12k" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datascience import *\n", "t = Table().with_columns('index',[0,1,2],'amount',datalist)\n", "t" ] }, { "cell_type": "code", "execution_count": 7, "id": "e06815b9-60c4-4b3d-be6f-23ec9ea33764", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index amount value
0 40k 40000
1 31k 31000
2 12k 12000
" ], "text/plain": [ "index | amount | value\n", "0 | 40k | 40000\n", "1 | 31k | 31000\n", "2 | 12k | 12000" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = t.with_columns('value',[int(data.split('k')[0])*1000 for data in datalist])\n", "t" ] }, { "cell_type": "markdown", "id": "f207726b-05ca-4621-91b7-6b57200abf83", "metadata": {}, "source": [ "#### Float data embeded in string within table" ] }, { "cell_type": "code", "execution_count": 2, "id": "ffaf6389-ddb0-4135-9655-aa2397f56de9", "metadata": { "tags": [] }, "outputs": [], "source": [ "datalist = ['4.01k', '3.11k', '1.25k']" ] }, { "cell_type": "code", "execution_count": 3, "id": "28422959-1078-42f2-b265-358f749f1b30", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[4010.0, 3110.0, 1250.0]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[float(data.split('k')[0])*1000 for data in datalist]" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebf73b84-ff5e-46be-afd5-3780fbb97b5a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
position salary
Data scientist 75k
Chemist 102,500
Chemist 99k
Biologist 103k
Physicist 99,000
Finance 34,000
" ], "text/plain": [ "position | salary\n", "Data scientist | 75k\n", "Chemist | 102,500\n", "Chemist | 99k\n", "Biologist | 103k\n", "Physicist | 99,000\n", "Finance | 34,000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "salary = Table().with_columns('position',['Data scientist','Chemist','Chemist','Biologist','Physicist','Finance'],\n", " 'salary',['75k','102,500','99k','103k','99,000','34,000'])\n", "salary" ] }, { "cell_type": "code", "execution_count": 6, "id": "a692d772-fab9-49f7-b3af-e06e9f7c1396", "metadata": { "jupyter": { "source_hidden": true }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[75000, 102500000, 99000, 103000, 99000000, 34000000]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[int(data.replace(',','').split('k')[0])*1000 for data in salary.column('salary')]" ] }, { "cell_type": "markdown", "id": "1aa5103f-c3d1-4539-9771-5c35e9c9e5c6", "metadata": {}, "source": [ "### Working with time strings" ] }, { "cell_type": "code", "execution_count": 8, "id": "0dd43637-8566-4f1c-8ce9-9fe98e4fe63c", "metadata": {}, "outputs": [], "source": [ "time_values = ['12:03:56', '01:04:23', '03:35:00']" ] }, { "cell_type": "code", "execution_count": 9, "id": "7cb0d25c-654e-4034-b3f8-6ade45d59f90", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index amount value time hour
0 40k 40000 12:03:56 12
1 31k 31000 01:04:23 1
2 12k 12000 03:35:00 3
" ], "text/plain": [ "index | amount | value | time | hour\n", "0 | 40k | 40000 | 12:03:56 | 12\n", "1 | 31k | 31000 | 01:04:23 | 1\n", "2 | 12k | 12000 | 03:35:00 | 3" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = t.with_columns('time',time_values,'hour',[int(data.split(':')[0]) for data in time_values])\n", "t" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 5 }