clustering!
bopjesvla committed Feb 8, 2025
1 parent cd1747c commit 28d8539
Showing 7 changed files with 304 additions and 63 deletions.
10 changes: 7 additions & 3 deletions cron.sh
@@ -3,7 +3,11 @@
 * * * * * cd hitch && /usr/bin/flock -n /tmp/show.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/show.py' > cronlog.txt 2>&1
 # every 10 minutes
 */10 * * * * cd hitch && /usr/bin/flock -n /tmp/show.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/show.py light' > cronlog-light.txt 2>&1
-# each day at 6
-0 6 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dump.py' > dumplog.txt 2>&1
+# each day at 3
+0 3 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/fetch-roads.py' > fetchroadlog.txt 2>&1
+# each day at midnight
+0 0 * * * cd hitch && /usr/bin/flock -n /tmp/dump.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dump.py' > dumplog.txt 2>&1
 # every day at midnight
-0 0 * * * cd hitch && /usr/bin/flock -n /tmp/dashboard.lockfile bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dashboard.py' > dashboard.txt 2>&1
+0 0 * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/fetch-areas.py' > fetcharealog.txt 2>&1
+# every hour
+0 * * * * cd hitch && bash -c '. $HOME/.bashrc; /home/bob/.asdf/shims/python scripts/dashboard.py' > dashboard.txt 2>&1
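The flock -n wrappers are what keep a slow run from stacking up behind itself: with -n, a second invocation fails to acquire the lock and exits instead of queueing. A minimal Python sketch of the same non-blocking pattern, using only the standard library on Unix (the lock path is borrowed from the crontab for illustration):

import fcntl
import sys

# Try to take an exclusive, non-blocking lock, like `flock -n`.
lockfile = open("/tmp/show.lockfile", "w")
try:
    fcntl.flock(lockfile, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
    sys.exit(0)  # a previous run still holds the lock; skip this one

# ... long-running work would go here ...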
4 changes: 3 additions & 1 deletion requirements.txt
@@ -4,13 +4,15 @@ dash==2.17.1
 Flask-Mailman==1.1.1
 Flask-Security==5.5.2
 Flask-SQLAlchemy==3.1.1
-Flask-WTF==1.2.2
+Flask-WTF==1.2.2
 folium==0.19.4
+networkx==3.2.1
 numpy==2.2.2
 pandas==2.2.3
 geopandas==1.0.1
 plotly==5.23.0
 pycountry==24.6.1
 ruff==0.9.3
 shapely==2.0.6
 requests-cache==0.6.4
+scikit-learn==1.6.1
21 changes: 9 additions & 12 deletions scripts/dump.py
@@ -16,19 +16,16 @@
     print(f"DB not found: {DATABASE}")
     exit()
 
-all_points = pd.read_sql(
-    "select * from points where not banned", sqlite3.connect(DATABASE)
-)
+all_points = pd.read_sql("select * from points where not banned", sqlite3.connect(DATABASE))
 all_points["ip"] = ""
-all_points.to_sql(
-    "points", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace"
-)
+all_points.to_sql("points", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
 
 
-duplicates = pd.read_sql(
-    "select * from duplicates where reviewed = accepted", sqlite3.connect(DATABASE)
-)
+duplicates = pd.read_sql("select * from duplicates where reviewed = accepted", sqlite3.connect(DATABASE))
 duplicates["ip"] = ""
-duplicates.to_sql(
-    "duplicates", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace"
-)
+duplicates.to_sql("duplicates", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
+service_areas = pd.read_sql("select * from service_areas", sqlite3.connect(DATABASE))
+service_areas.to_sql("service_areas", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
+
+road_islands = pd.read_sql("select * from road_islands", sqlite3.connect(DATABASE))
+road_islands.to_sql("road_islands", sqlite3.connect(DATABASE_DUMP), index=False, if_exists="replace")
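Every table here follows the same read, scrub, replace pattern, which keeps the dump idempotent: re-running it overwrites the previous dump instead of appending. A hypothetical helper capturing that pattern (the function name and paths are illustrative, not part of the commit):

import sqlite3
import pandas as pd

def dump_table(src_path, dst_path, table, scrub_cols=(), where="1=1"):
    # Read from the live DB, blank privacy-sensitive columns, replace in the dump.
    df = pd.read_sql(f"select * from {table} where {where}", sqlite3.connect(src_path))
    for col in scrub_cols:
        df[col] = ""  # same scrubbing as all_points["ip"] = "" above
    df.to_sql(table, sqlite3.connect(dst_path), index=False, if_exists="replace")

dump_table("db/points.sqlite", "db/dump.sqlite", "points",
           scrub_cols=("ip",), where="not banned")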
111 changes: 111 additions & 0 deletions scripts/fetch-areas.py
@@ -0,0 +1,111 @@
import requests_cache
import requests
import pandas as pd
from scipy.spatial import cKDTree

Check failure (GitHub Actions / build): scripts/fetch-areas.py:4:27: F401 `scipy.spatial.cKDTree` imported but unused
from shapely.geometry import Point, LineString, Polygon, MultiLineString

Check failure (GitHub Actions / build): scripts/fetch-areas.py:5:37: F401 `shapely.geometry.LineString` imported but unused

Check failure (GitHub Actions / build): scripts/fetch-areas.py:5:58: F401 `shapely.geometry.MultiLineString` imported but unused
from shapely.ops import nearest_points

Check failure (GitHub Actions / build): scripts/fetch-areas.py:6:25: F401 `shapely.ops.nearest_points` imported but unused
import shapely
import os
import sqlite3

Check failure (GitHub Actions / build): scripts/fetch-areas.py:9:8: F401 `sqlite3` imported but unused
import time
import numpy as np

Check failure (GitHub Actions / build): scripts/fetch-areas.py:11:17: F401 `numpy` imported but unused
import networkx

Check failure (GitHub Actions / build): scripts/fetch-areas.py:12:8: F401 `networkx` imported but unused
from helpers import get_db, scripts_dir
from sklearn.cluster import DBSCAN

cache_file = os.path.join(scripts_dir, "overpass_api_cache")
requests_cache.install_cache(cache_file, backend="sqlite", expire_after=6 * 365 * 24 * 60 * 60)

points = pd.read_sql("select * from points where not banned", get_db())

# Load coordinates (Assuming 'points' DataFrame exists with "lon" and "lat")
coords = points[["lon", "lat"]].drop_duplicates().reset_index(drop=True)

# Candidate clustering with DBSCAN
# This clustering is purely spatial, not OSM aware at all
AREA_MERGE_DISTANCE = 800
AREA_MERGE_DISTANCE_DEG = AREA_MERGE_DISTANCE / 111_000 # 111km per degree
min_samples = 2 # Minimum points to form a cluster
dbscan = DBSCAN(eps=AREA_MERGE_DISTANCE_DEG, min_samples=min_samples, metric="euclidean").fit(coords)

coords["cluster"] = dbscan.labels_

print(sum(coords["cluster"] != -1), len(coords))

# Filter out the loners
clusters = coords[coords["cluster"] != -1]


def get_service_area(lat, lon):
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    is_in({lat}, {lon})->.a;
    (
        area.a["amenity"="fuel"];
        area.a["highway"="service_area"];
        area.a["highway"="rest_area"];
        area.a["highway"="parking"];
        area.a["highway"="services"];
    );
    wr(pivot);
    out geom;
    """
    # Query Overpass API, retrying until a valid JSON response arrives
    data = None
    while data is None:
        try:
            response = requests.get(overpass_url, params={"data": overpass_query})
            if not response.from_cache:
                print("getting service area", lat, lon)
                time.sleep(1)
            data = response.json()
        except Exception as e:
            print(e)
    max_size = -1
    largest_geom = largest_geom_id = None

    if "elements" not in data:
        return None, None

    # Convert results into polygons and check size
    point = Point(lon, lat)

Check failure (GitHub Actions / build): scripts/fetch-areas.py:73:5: F841 Local variable `point` is assigned to but never used
    for element in data["elements"]:
        if "geometry" in element:
            coords = [(node["lon"], node["lat"]) for node in element["geometry"]]

            if len(coords) < 3:
                continue

            polygon = Polygon(coords)
            size = polygon.area
        elif "members" in element:
            size = 0
            for member in element["members"]:
                coords = [(node["lon"], node["lat"]) for node in member["geometry"]]
                if len(coords) < 3:
                    continue
                polygon = Polygon(coords)
                size += polygon.area
        else:
            continue

        if size > max_size:  # Check if this is the largest containing parking/station
            max_size = size
            largest_geom_id = element["id"]
            largest_geom = polygon

    if largest_geom:
        print("SERVICE", largest_geom_id)
        return largest_geom_id, largest_geom
    return None, None  # no containing area found; caller unpacks two values


areas = []
for lon, lat, cluster in clusters.values:

Check failure (GitHub Actions / build): scripts/fetch-areas.py:105:15: B007 Loop control variable `cluster` not used within loop body
    geom_id, geom = get_service_area(lat, lon)
    if geom is not None:
        areas.append((geom_id, shapely.convex_hull(geom).wkt))

areas_df = pd.DataFrame(areas, columns=["geom_id", "geometry_wkt"]).drop_duplicates("geometry_wkt")
areas_df.to_sql("service_areas", get_db(), if_exists="replace", index=False)
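The eps conversion above treats one degree as roughly 111 km and applies a euclidean metric to raw lon/lat pairs, so east-west distances are somewhat overstated at European latitudes; at an 800 m merge threshold that error is tolerable. A toy run showing how the threshold behaves (coordinates invented for illustration):

import numpy as np
from sklearn.cluster import DBSCAN

coords = np.array([
    [4.8950, 52.3700],  # (lon, lat), roughly Amsterdam
    [4.9010, 52.3720],  # ~700 m from the first point: merged
    [4.9600, 52.4100],  # several km away: labeled noise
])
eps_deg = 800 / 111_000  # 800 m expressed in degrees, as in fetch-areas.py
labels = DBSCAN(eps=eps_deg, min_samples=2, metric="euclidean").fit(coords).labels_
print(labels)  # [0, 0, -1]: two clustered points, one loner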
102 changes: 102 additions & 0 deletions scripts/fetch-roads.py
@@ -0,0 +1,102 @@
import pandas as pd
import requests_cache
import requests
import sqlite3

Check failure (GitHub Actions / build): scripts/fetch-roads.py:4:8: F401 `sqlite3` imported but unused
from shapely.geometry import MultiLineString, LineString, Point, Polygon
import shapely
from sklearn.cluster import DBSCAN
import os
import time

from helpers import get_db, scripts_dir

cache_file = os.path.join(scripts_dir, "overpass_api_cache")
requests_cache.install_cache(cache_file, backend="sqlite", expire_after=6 * 365 * 24 * 60 * 60)

points = pd.read_sql("select * from points where not banned", get_db())

# Load coordinates (Assuming 'points' DataFrame exists with "lon" and "lat")
coords = points[["lon", "lat"]].drop_duplicates().reset_index(drop=True)

# Candidate clustering with DBSCAN
# This clustering is purely spatial, not OSM aware at all
ROAD_MERGE_DISTANCE = 100
ROAD_MERGE_DISTANCE_DEG = ROAD_MERGE_DISTANCE / 111_000 # 111km per degree
min_samples = 2 # Minimum points to form a cluster
dbscan = DBSCAN(eps=ROAD_MERGE_DISTANCE_DEG, min_samples=min_samples, metric="euclidean").fit(coords)

# Assign cluster labels
coords["cluster"] = dbscan.labels_

# Filter out the loners
clusters = coords[coords["cluster"] != -1]


# Overpass Query Function
def fetch_osm_data(lat, lon, search_size):
    query = f"""
    [out:json];
    way(around:{search_size},{lat},{lon})["highway"~"motorway|trunk|primary|secondary|tertiary|unclassified|residential|service"];
    (._;>;);
    out body geom;
    """
    url = "http://overpass-api.de/api/interpreter"

    while True:
        try:
            response = requests.get(url, params={"data": query})
            if not response.from_cache:
                time.sleep(1)
                print(f"fetching for {lat}, {lon}")
            return response.json()
        except Exception as e:
            print(e)


# Process clusters
road_networks = []
road_islands = []
road_island_id = 0

grouped = clusters.groupby("cluster")
print(len(grouped))
for cluster_id, group in grouped:
    lat, lon = group["lat"].mean(), group["lon"].mean()
    search_size_deg = 1.2 * (group["lat"].max() - group["lat"].min() + group["lon"].max() - group["lon"].min())
    search_size = search_size_deg * 111_000

    osm_data = fetch_osm_data(lat, lon, search_size)

    if osm_data and "elements" in osm_data:
        lines = []
        for element in osm_data["elements"]:
            if "geometry" in element:
                line_coords = [(pt["lon"], pt["lat"]) for pt in element["geometry"]]
                lines.append(LineString(line_coords))

        if lines:
            multilinestring = MultiLineString(lines)
            geom_wkt = multilinestring.wkt
            road_networks.append((lat, lon, search_size_deg, geom_wkt))

            # create perimeter
            perimeter = Point(lon, lat).buffer(search_size_deg / 1.1, quad_segs=4)

            roads_in_perimeter = multilinestring.intersection(perimeter)

            road_network_with_boundary = shapely.unary_union([perimeter.boundary, roads_in_perimeter], grid_size=0.000001)
            # Each "hole" in the road network is its own road_island
            road_island_collection = shapely.polygonize([road_network_with_boundary])

            for road_island in road_island_collection.geoms:
                road_islands.append((road_island_id, road_island.wkt))
                road_island_id += 1


# Convert to DataFrame
road_networks_df = pd.DataFrame(road_networks, columns=["lat", "lon", "search_size_deg", "geometry_wkt"])
road_islands_df = pd.DataFrame(road_islands, columns=["id", "geometry_wkt"]).drop_duplicates("geometry_wkt")

# Store in SQLite Database
road_networks_df.to_sql("road_networks", get_db(), if_exists="replace", index=False)
road_islands_df.to_sql("road_islands", get_db(), if_exists="replace", index=False)
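The road-island step is the least obvious part: unary_union nodes the perimeter circle and the clipped roads into one linework, and polygonize then emits every enclosed face, so each patch of land bounded by roads or by the perimeter becomes its own island polygon. A self-contained toy version of that step (geometry invented for illustration):

import shapely
from shapely.geometry import LineString, Point

perimeter = Point(0, 0).buffer(1.0, quad_segs=4)   # stand-in for the search circle
road = LineString([(-2, 0), (2, 0)])               # one road crossing the circle
road_in_perimeter = road.intersection(perimeter)

# Node the boundary and the road together, then extract the enclosed faces.
network = shapely.unary_union([perimeter.boundary, road_in_perimeter])
islands = shapely.polygonize([network])
print(len(islands.geoms))  # 2: the road splits the disc into two islands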
10 changes: 9 additions & 1 deletion scripts/helpers.py
@@ -1,5 +1,5 @@
 import os
-
+import sqlite3
 import numpy as np
 
 
@@ -36,6 +36,14 @@ def get_bearing(lon1, lat1, lon2, lat2):
     return brng
 
 
+def get_db():
+    if os.path.exists(os.path.join(db_dir, "prod-points.sqlite")):
+        DATABASE = os.path.join(db_dir, "prod-points.sqlite")
+    else:
+        DATABASE = os.path.join(db_dir, "points.sqlite")
+    return sqlite3.connect(DATABASE)
+
+
 scripts_dir = os.path.dirname(__file__)
 root_dir = os.path.join(scripts_dir, "..")
 db_dir = os.path.abspath(os.path.join(root_dir, "db"))