{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Create canonical labels attached to tessellation\n",
"\n",
"This notebook combines the results of the first- and second-level clustering, generating a canonical signature type ID that reflects both levels."
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import dask.dataframe\n",
"import dask_geopandas"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"level1 = pd.read_parquet(\"../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"level1"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" kmeans10gb | \n",
"
\n",
" \n",
" hindex | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" c000e094707t0000 | \n",
" 4 | \n",
"
\n",
" \n",
" c000e094763t0000 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094763t0001 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094763t0002 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094764t0000 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" c102e644989t0111 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0112 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0113 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0114 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0115 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
14539578 rows × 1 columns
\n",
"
"
],
"text/plain": [
" kmeans10gb\n",
"hindex \n",
"c000e094707t0000 4\n",
"c000e094763t0000 0\n",
"c000e094763t0001 0\n",
"c000e094763t0002 0\n",
"c000e094764t0000 0\n",
"... ...\n",
"c102e644989t0111 0\n",
"c102e644989t0112 0\n",
"c102e644989t0113 0\n",
"c102e644989t0114 0\n",
"c102e644989t0115 0\n",
"\n",
"[14539578 rows x 1 columns]"
]
},
"metadata": {},
"execution_count": 3
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 4,
"source": [
"level2_9 = pd.read_parquet(\"../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq\", columns=['9'])"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 5,
"source": [
"level2_9"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 9 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 113639 | \n",
" 0 | \n",
"
\n",
" \n",
" 113640 | \n",
" 0 | \n",
"
\n",
" \n",
" 113641 | \n",
" 0 | \n",
"
\n",
" \n",
" 113642 | \n",
" 0 | \n",
"
\n",
" \n",
" 113643 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
113644 rows × 1 columns
\n",
"
"
],
"text/plain": [
" 9\n",
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"... ..\n",
"113639 0\n",
"113640 0\n",
"113641 0\n",
"113642 0\n",
"113643 0\n",
"\n",
"[113644 rows x 1 columns]"
]
},
"metadata": {},
"execution_count": 5
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"source": [
"level2_2 = pd.read_parquet(\"../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 7,
"source": [
"level2_2"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subclustering_cluster2_k3 | \n",
"
\n",
" \n",
" hindex | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" c000e097919t0003 | \n",
" 1 | \n",
"
\n",
" \n",
" c000e097919t0005 | \n",
" 1 | \n",
"
\n",
" \n",
" c000e097919t0008 | \n",
" 1 | \n",
"
\n",
" \n",
" c000e097919t0009 | \n",
" 1 | \n",
"
\n",
" \n",
" c000e097919t0015 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" c102e639766t0007 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e639766t0010 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e639766t0011 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e639766t0012 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e639766t0013 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
1115564 rows × 1 columns
\n",
"
"
],
"text/plain": [
" subclustering_cluster2_k3\n",
"hindex \n",
"c000e097919t0003 1\n",
"c000e097919t0005 1\n",
"c000e097919t0008 1\n",
"c000e097919t0009 1\n",
"c000e097919t0015 1\n",
"... ...\n",
"c102e639766t0007 0\n",
"c102e639766t0010 0\n",
"c102e639766t0011 0\n",
"c102e639766t0012 0\n",
"c102e639766t0013 0\n",
"\n",
"[1115564 rows x 1 columns]"
]
},
"metadata": {},
"execution_count": 7
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"source": [
"# Start from the first-level labels: clusters without a second level keep their level-1 ID.\n",
"labels = level1.copy()\n",
"labels[\"signature_type\"] = labels[\"kmeans10gb\"]\n",
"\n",
"# Sub-clustered level-1 clusters get a two-digit code: level-1 ID * 10 + level-2 ID.\n",
"in_cluster9 = labels[\"kmeans10gb\"] == 9\n",
"in_cluster2 = labels[\"kmeans10gb\"] == 2\n",
"\n",
"# `level2_9` only has a positional (0..n-1) index, so it can be matched to cluster 9 by row\n",
"# order alone; at least make sure the row counts agree before assigning positionally.\n",
"assert in_cluster9.sum() == len(level2_9), \"level2_9 does not have one row per cluster-9 cell\"\n",
"labels.loc[in_cluster9, \"signature_type\"] = (\n",
"    labels.loc[in_cluster9, \"signature_type\"] * 10 + level2_9[\"9\"].values\n",
")\n",
"\n",
"# `level2_2` is indexed by hindex, so check that it lines up with cluster 2 row for row;\n",
"# `.values` discards the index and would otherwise silently attach labels to the wrong cells.\n",
"assert labels.index[in_cluster2.to_numpy()].equals(level2_2.index), \"level2_2 is not aligned with cluster 2\"\n",
"labels.loc[in_cluster2, \"signature_type\"] = (\n",
"    labels.loc[in_cluster2, \"signature_type\"] * 10 + level2_2[\"subclustering_cluster2_k3\"].values\n",
")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 9,
"source": [
"# Only the combined ID is kept; the raw level-1 column is removed before saving.\n",
"labels = labels.drop(\"kmeans10gb\", axis=\"columns\")\n",
"labels"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" signature_type | \n",
"
\n",
" \n",
" hindex | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" c000e094707t0000 | \n",
" 4 | \n",
"
\n",
" \n",
" c000e094763t0000 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094763t0001 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094763t0002 | \n",
" 0 | \n",
"
\n",
" \n",
" c000e094764t0000 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" c102e644989t0111 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0112 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0113 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0114 | \n",
" 0 | \n",
"
\n",
" \n",
" c102e644989t0115 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
14539578 rows × 1 columns
\n",
"
"
],
"text/plain": [
" signature_type\n",
"hindex \n",
"c000e094707t0000 4\n",
"c000e094763t0000 0\n",
"c000e094763t0001 0\n",
"c000e094763t0002 0\n",
"c000e094764t0000 0\n",
"... ...\n",
"c102e644989t0111 0\n",
"c102e644989t0112 0\n",
"c102e644989t0113 0\n",
"c102e644989t0114 0\n",
"c102e644989t0115 0\n",
"\n",
"[14539578 rows x 1 columns]"
]
},
"metadata": {},
"execution_count": 9
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 19,
"source": [
"labels.to_parquet(\"../../urbangrammar_samba/spatial_signatures/signatures/signatures_combined_levels_labels.pq\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [
"cells = dask_geopandas.read_parquet(\"../../urbangrammar_samba/spatial_signatures/tessellation/\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"cells = cells.merge(labels, how=\"left\", left_on=\"hindex\", right_index=True)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 12,
"source": [
"# Remove the `buildings` column and rename `tessellation` to `geometry` in a single chain.\n",
"cells = (\n",
"    cells\n",
"    .drop(columns=\"buildings\")\n",
"    .rename(columns={\"tessellation\": \"geometry\"})\n",
")\n",
"cells"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"Dask DataFrame Structure:
\n",
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" hindex | \n",
" geometry | \n",
" signature_type | \n",
"
\n",
" \n",
" npartitions=103 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" object | \n",
" object | \n",
" float64 | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
"
\n",
"
\n",
"Dask Name: rename, 413 tasks
"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" hindex geometry signature_type\n",
"npartitions=103 \n",
" object object float64\n",
" ... ... ...\n",
"... ... ... ...\n",
" ... ... ...\n",
" ... ... ...\n",
"Dask Name: rename, 413 tasks"
]
},
"metadata": {},
"execution_count": 12
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 13,
"source": [
"cells = dask_geopandas.from_dask_dataframe(cells)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 14,
"source": [
"cells"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"Dask-GeoPandas GeoDataFrame Structure:
\n",
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" hindex | \n",
" geometry | \n",
" signature_type | \n",
"
\n",
" \n",
" npartitions=103 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" object | \n",
" geometry | \n",
" float64 | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
"
\n",
"
\n",
"Dask Name: GeoDataFrame, 516 tasks
"
],
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 14
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 15,
"source": [
"%%time\n",
"cells.to_parquet(\"../../urbangrammar_samba/spatial_signatures/signatures/signatures_combined_tessellation/\")"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/opt/conda/lib/python3.8/site-packages/dask/utils.py:35: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata. This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec\n",
"\n",
"This metadata specification does not yet make stability promises. We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.\n",
"\n",
"To further ignore this warning, you can do: \n",
"import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')\n",
" return func(*args, **kwargs)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 17min 44s, sys: 2min 26s, total: 20min 11s\n",
"Wall time: 20min 11s\n"
]
}
],
"metadata": {
"tags": []
}
},
{
"cell_type": "markdown",
"source": [
"We can check the result and its spatial partitioning."
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 16,
"source": [
"cells = dask_geopandas.read_parquet(\"../../urbangrammar_samba/spatial_signatures/signatures/signatures_combined_tessellation/\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 17,
"source": [
"cells"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"Dask-GeoPandas GeoDataFrame Structure:
\n",
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" hindex | \n",
" geometry | \n",
" signature_type | \n",
"
\n",
" \n",
" npartitions=103 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" object | \n",
" geometry | \n",
" int32 | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
"
\n",
"
\n",
"Dask Name: read-parquet, 103 tasks
"
],
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 17
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 18,
"source": [
"%%time\n",
"cells.calculate_spatial_partitions()"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 27min 22s, sys: 34 s, total: 27min 57s\n",
"Wall time: 3min 22s\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 19,
"source": [
"cells.spatial_partitions"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 POLYGON Z ((339010.000 426700.000 0.000, 33596...\n",
"1 POLYGON Z ((459529.390 339487.330 0.000, 45951...\n",
"2 POLYGON Z ((380696.000 228900.000 0.000, 37766...\n",
"3 POLYGON Z ((363161.000 541631.000 0.000, 33680...\n",
"4 POLYGON Z ((431181.406 393369.256 0.000, 40940...\n",
" ... \n",
"98 POLYGON Z ((448173.875 209362.352 0.000, 44814...\n",
"99 POLYGON Z ((513455.520 192183.030 0.000, 50044...\n",
"100 POLYGON Z ((172420.000 603651.000 0.000, 16125...\n",
"101 POLYGON Z ((419376.825 272261.845 0.000, 41514...\n",
"102 POLYGON Z ((367870.000 68210.000 0.000, 367820...\n",
"Length: 103, dtype: geometry"
]
},
"metadata": {},
"execution_count": 19
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 20,
"source": [
"cells.spatial_partitions.plot(figsize=(12, 12), cmap=\"tab20\", alpha=.2)"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 20
},
{
"output_type": "display_data",
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
}
}
],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}