clustering!
bopjesvla committed Feb 8, 2025
1 parent cd1747c commit 28d8539
Showing 7 changed files with 304 additions and 63 deletions.
10 changes: 7 additions & 3 deletions cron.sh
@@ -3,7 +3,11 @@
 * * * * * cd hitch && /usr/bin/flock -n /tmp/show.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/show.py' > cronlog.txt 2>&1
 # every 10 minutes
 */10 * * * * cd hitch && /usr/bin/flock -n /tmp/show.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/show.py light' > cronlog-light.txt 2>&1
-# each day at 6
-0 6 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dump.py' > dumplog.txt 2>&1
+# each day at 3
+0 3 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/fetch-roads.py' > fetchroadlog.txt 2>&1
+# each day at midnight
+0 0 * * * cd hitch && /usr/bin/flock -n /tmp/dump.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dump.py' > dumplog.txt 2>&1
 # every day at midnight
-0 0 * * * cd hitch && /usr/bin/flock -n /tmp/dashboard.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dashboard.py' > dashboard.txt 2>&1
+0 0 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/fetch-areas.py' > fetcharealog.txt 2>&1
+# every hour
+0 * * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dashboard.py' > dashboard.txt 2>&1
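The flock -n wrappers are what keep a slow run from stacking up behind itself: with -n, a second invocation fails to acquire the lock and exits instead of queueing. A minimal Python sketch of the same non-blocking pattern, using only the standard library on Unix (the lock path is borrowed from the crontab for illustration):

import fcntl
import sys

# Try to take an exclusive, non-blocking lock, like `flock -n`.
lockfile = open("/tmp/show.lockfile", "w")
try:
    fcntl.flock(lockfile, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
    sys.exit(0)  # a previous run still holds the lock; skip this one

# ... long-running work would go here ...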
4 changes: 3 additions & 1 deletion requirements.txt
@@ -4,13 +4,15 @@ dash==2.17.1
 Flask-Mailman==1.1.1
 Flask-Security==5.5.2
 Flask-SQLAlchemy==3.1.1
-Flask-WTF==1.2.2
+Flask-WTF==1.2.2
 folium==0.19.4
+networkx==3.2.1
 numpy==2.2.2
 pandas==2.2.3
 geopandas==1.0.1
 plotly==5.23.0
 pycountry==24.6.1
 ruff==0.9.3
 shapely==2.0.6
 requests-cache==0.6.4
+scikit-learn==1.6.1
21 changes: 9 additions & 12 deletions scripts/dump.py
@@ -16,19 +16,16 @@
     print(f"DB not found: {DATABASE}")
     exit()
 
-all_points = pd.read_sql(
-    "select * from points where not banned", sqlite3.connect(DATABASE)
-)
+all_points = pd.read_sql("select * from points where not banned", sqlite3.connect(DATABASE))
 all_points["ip"] = ""
-all_points.to_sql(
-    "points", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace"
-)
+all_points.to_sql("points", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
 
 
-duplicates = pd.read_sql(
-    "select * from duplicates where reviewed = accepted", sqlite3.connect(DATABASE)
-)
+duplicates = pd.read_sql("select * from duplicates where reviewed = accepted", sqlite3.connect(DATABASE))
 duplicates["ip"] = ""
-duplicates.to_sql(
-    "duplicates", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace"
-)
+duplicates.to_sql("duplicates", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
+service_areas = pd.read_sql("select * from service_areas", sqlite3.connect(DATABASE))
+service_areas.to_sql("service_areas", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
+
+road_islands = pd.read_sql("select * from road_islands", sqlite3.connect(DATABASE))
+road_islands.to_sql("road_islands", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
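Every table here follows the same read, scrub, replace pattern, which keeps the dump idempotent: re-running it overwrites the previous dump instead of appending. A hypothetical helper capturing that pattern (the function name and paths are illustrative, not part of the commit):

import sqlite3
import pandas as pd

def dump_table(src_path, dst_path, table, scrub_cols=(), where="1=1"):
    # Read from the live DB, blank privacy-sensitive columns, replace in the dump.
    df = pd.read_sql(f"select * from {table} where {where}", sqlite3.connect(src_path))
    for col in scrub_cols:
        df[col] = ""  # same scrubbing as all_points["ip"] = "" above
    df.to_sql(table, sqlite3.connect(dst_path), index=False, if_exists="replace")

dump_table("db/points.sqlite", "db/dump.sqlite", "points",
           scrub_cols=("ip",), where="not banned")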
111 changes: 111 additions & 0 deletions scripts/fetch-areas.py
@@ -0,0 +1,111 @@
import requests_cache
import requests
import pandas as pd
from scipy.spatial import cKDTree

Check failure (GitHub Actions / build): scripts/fetch-areas.py:4:27: F401 `scipy.spatial.cKDTree` imported but unused
from shapely.geometry import Point, LineString, Polygon, MultiLineString

Check failure (GitHub Actions / build): scripts/fetch-areas.py:5:37: F401 `shapely.geometry.LineString` imported but unused

Check failure (GitHub Actions / build): scripts/fetch-areas.py:5:58: F401 `shapely.geometry.MultiLineString` imported but unused
from shapely.ops import nearest_points

Check failure (GitHub Actions / build): scripts/fetch-areas.py:6:25: F401 `shapely.ops.nearest_points` imported but unused
import shapely
import os
import sqlite3

Check failure (GitHub Actions / build): scripts/fetch-areas.py:9:8: F401 `sqlite3` imported but unused
import time
import numpy as np

Check failure (GitHub Actions / build): scripts/fetch-areas.py:11:17: F401 `numpy` imported but unused
import networkx

Check failure (GitHub Actions / build): scripts/fetch-areas.py:12:8: F401 `networkx` imported but unused
from helpers import get_db, scripts_dir
from sklearn.cluster import DBSCAN

cache_file = os.path.join(scripts_dir, "overpass_api_cache")
requests_cache.install_cache(cache_file, backend="sqlite", expire_after=6 * 365 * 24 * 60 * 60)

points = pd.read_sql("select * from points where not banned", get_db())

# Load coordinates (Assuming 'points' DataFrame exists with "lon" and "lat")
coords = points[["lon", "lat"]].drop_duplicates().reset_index(drop=True)

# Candidate clustering with DBSCAN
# This clustering is purely spatial, not OSM aware at all
AREA_MERGE_DISTANCE = 800
AREA_MERGE_DISTANCE_DEG = AREA_MERGE_DISTANCE / 111_000 # 111km per degree
min_samples = 2 # Minimum points to form a cluster
dbscan = DBSCAN(eps=AREA_MERGE_DISTANCE_DEG, min_samples=min_samples, metric="euclidean").fit(coords)

coords["cluster"] = dbscan.labels_

print(sum(coords["cluster"] != -1), len(coords))

# Filter out the loners
clusters = coords[coords["cluster"] != -1]


def get_service_area(lat, lon):
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    is_in({lat}, {lon})->.a;
    (
        area.a["amenity"="fuel"];
        area.a["highway"="service_area"];
        area.a["highway"="rest_area"];
        area.a["highway"="parking"];
        area.a["highway"="services"];
    );
    wr(pivot);
    out geom;
    """
    # Query Overpass API, retrying until a valid JSON response arrives
    data = None
    while data is None:
        try:
            response = requests.get(overpass_url, params={"data": overpass_query})
            if not response.from_cache:
                print("getting service area", lat, lon)
                time.sleep(1)
            data = response.json()
        except Exception as e:
            print(e)
    max_size = -1
    largest_geom = largest_geom_id = None

    if "elements" not in data:
        return None, None

    # Convert results into polygons and check size
    point = Point(lon, lat)

Check failure (GitHub Actions / build): scripts/fetch-areas.py:73:5: F841 Local variable `point` is assigned to but never used
    for element in data["elements"]:
        if "geometry" in element:
            coords = [(node["lon"], node["lat"]) for node in element["geometry"]]

            if len(coords) < 3:
                continue

            polygon = Polygon(coords)
            size = polygon.area
        elif "members" in element:
            size = 0
            for member in element["members"]:
                coords = [(node["lon"], node["lat"]) for node in member["geometry"]]
                if len(coords) < 3:
                    continue
                polygon = Polygon(coords)
                size += polygon.area
        else:
            continue

        if size > max_size:  # Check if this is the largest containing parking/station
            max_size = size
            largest_geom_id = element["id"]
            largest_geom = polygon

    if largest_geom:
        print("SERVICE", largest_geom_id)
        return largest_geom_id, largest_geom
    return None, None  # no containing area found; caller unpacks two values


areas = []
for lon, lat, cluster in clusters.values:

Check failure (GitHub Actions / build): scripts/fetch-areas.py:105:15: B007 Loop control variable `cluster` not used within loop body
    geom_id, geom = get_service_area(lat, lon)
    if geom is not None:
        areas.append((geom_id, shapely.convex_hull(geom).wkt))

areas_df = pd.DataFrame(areas, columns=["geom_id", "geometry_wkt"]).drop_duplicates("geometry_wkt")
areas_df.to_sql("service_areas", get_db(), if_exists="replace", index=False)
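The eps conversion above treats one degree as roughly 111 km and applies a euclidean metric to raw lon/lat pairs, so east-west distances are somewhat overstated at European latitudes; at an 800 m merge threshold that error is tolerable. A toy run showing how the threshold behaves (coordinates invented for illustration):

import numpy as np
from sklearn.cluster import DBSCAN

coords = np.array([
    [4.8950, 52.3700],  # (lon, lat), roughly Amsterdam
    [4.9010, 52.3720],  # ~700 m from the first point: merged
    [4.9600, 52.4100],  # several km away: labeled noise
])
eps_deg = 800 / 111_000  # 800 m expressed in degrees, as in fetch-areas.py
labels = DBSCAN(eps=eps_deg, min_samples=2, metric="euclidean").fit(coords).labels_
print(labels)  # [0, 0, -1]: two clustered points, one loner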
102 changes: 102 additions & 0 deletions scripts/fetch-roads.py
@@ -0,0 +1,102 @@
import pandas as pd
import requests_cache
import requests
import sqlite3

Check failure (GitHub Actions / build): scripts/fetch-roads.py:4:8: F401 `sqlite3` imported but unused
from shapely.geometry import MultiLineString, LineString, Point, Polygon
import shapely
from sklearn.cluster import DBSCAN
import os
import time

from helpers import get_db, scripts_dir

cache_file = os.path.join(scripts_dir, "overpass_api_cache")
requests_cache.install_cache(cache_file, backend="sqlite", expire_after=6 * 365 * 24 * 60 * 60)

points = pd.read_sql("select * from points where not banned", get_db())

# Load coordinates (Assuming 'points' DataFrame exists with "lon" and "lat")
coords = points[["lon", "lat"]].drop_duplicates().reset_index(drop=True)

# Candidate clustering with DBSCAN
# This clustering is purely spatial, not OSM aware at all
ROAD_MERGE_DISTANCE = 100
ROAD_MERGE_DISTANCE_DEG = ROAD_MERGE_DISTANCE / 111_000 # 111km per degree
min_samples = 2 # Minimum points to form a cluster
dbscan = DBSCAN(eps=ROAD_MERGE_DISTANCE_DEG, min_samples=min_samples, metric="euclidean").fit(coords)

# Assign cluster labels
coords["cluster"] = dbscan.labels_

# Filter out the loners
clusters = coords[coords["cluster"] != -1]


# Overpass Query Function
def fetch_osm_data(lat, lon, search_size):
    query = f"""
    [out:json];
    way(around:{search_size},{lat},{lon})["highway"~"motorway|trunk|primary|secondary|tertiary|unclassified|residential|service"];
    (._;>;);
    out body geom;
    """
    url = "http://overpass-api.de/api/interpreter"

    while True:
        try:
            response = requests.get(url, params={"data": query})
            if not response.from_cache:
                time.sleep(1)
                print(f"fetching for {lat}, {lon}")
            return response.json()
        except Exception as e:
            print(e)


# Process clusters
road_networks = []
road_islands = []
road_island_id = 0

grouped = clusters.groupby("cluster")
print(len(grouped))
for cluster_id, group in grouped:
    lat, lon = group["lat"].mean(), group["lon"].mean()
    search_size_deg = 1.2 * (group["lat"].max() - group["lat"].min() + group["lon"].max() - group["lon"].min())
    search_size = search_size_deg * 111_000

    osm_data = fetch_osm_data(lat, lon, search_size)

    if osm_data and "elements" in osm_data:
        lines = []
        for element in osm_data["elements"]:
            if "geometry" in element:
                line_coords = [(pt["lon"], pt["lat"]) for pt in element["geometry"]]
                lines.append(LineString(line_coords))

        if lines:
            multilinestring = MultiLineString(lines)
            geom_wkt = multilinestring.wkt
            road_networks.append((lat, lon, search_size_deg, geom_wkt))

            # create perimeter
            perimeter = Point(lon, lat).buffer(search_size_deg / 1.1, quad_segs=4)

            roads_in_perimeter = multilinestring.intersection(perimeter)

            road_network_with_boundary = shapely.unary_union([perimeter.boundary, roads_in_perimeter], grid_size=0.000001)
            # Each "hole" in the road network is its own road_island
            road_island_collection = shapely.polygonize([road_network_with_boundary])

            for road_island in road_island_collection.geoms:
                road_islands.append((road_island_id, road_island.wkt))
                road_island_id += 1


# Convert to DataFrame
road_networks_df = pd.DataFrame(road_networks, columns=["lat", "lon", "search_size_deg", "geometry_wkt"])
road_islands_df = pd.DataFrame(road_islands, columns=["id", "geometry_wkt"]).drop_duplicates("geometry_wkt")

# Store in SQLite Database
road_networks_df.to_sql("road_networks", get_db(), if_exists="replace", index=False)
road_islands_df.to_sql("road_islands", get_db(), if_exists="replace", index=False)
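The road-island step is the least obvious part: unary_union nodes the perimeter circle and the clipped roads into one linework, and polygonize then emits every enclosed face, so each patch of land bounded by roads or by the perimeter becomes its own island polygon. A self-contained toy version of that step (geometry invented for illustration):

import shapely
from shapely.geometry import LineString, Point

perimeter = Point(0, 0).buffer(1.0, quad_segs=4)   # stand-in for the search circle
road = LineString([(-2, 0), (2, 0)])               # one road crossing the circle
road_in_perimeter = road.intersection(perimeter)

# Node the boundary and the road together, then extract the enclosed faces.
network = shapely.unary_union([perimeter.boundary, road_in_perimeter])
islands = shapely.polygonize([network])
print(len(islands.geoms))  # 2: the road splits the disc into two islands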
10 changes: 9 additions & 1 deletion scripts/helpers.py
@@ -1,5 +1,5 @@
 import os
-
+import sqlite3
 import numpy as np
 
 
@@ -36,6 +36,14 @@ def get_bearing(lon1, lat1, lon2, lat2):
     return brng
 
 
+def get_db():
+    if os.path.exists(os.path.join(db_dir, "prod-points.sqlite")):
+        DATABASE = os.path.join(db_dir, "prod-points.sqlite")
+    else:
+        DATABASE = os.path.join(db_dir, "points.sqlite")
+    return sqlite3.connect(DATABASE)
+
+
 scripts_dir = os.path.dirname(__file__)
 root_dir = os.path.join(scripts_dir, "..")
 db_dir = os.path.abspath(os.path.join(root_dir, "db"))