Major restructuring of the dataloading

OxfordRSE · Feb 17, 2025 · 8026380 · 8026380
1 parent 51b2c59
commit 8026380
Show file tree

Hide file tree

Showing 753 changed files with 11,970,656 additions and 154 deletions.
diff --git a/examples/clustering.ipynb b/examples/clustering.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,40 +12,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import community as community_louvain\n",
-    "import collections\n",
     "import numpy as np\n",
+    "from IPython.display import display\n",
+    "import polars as pl\n",
     "import pandas as pd\n",
     "import networkx as nx\n",
-    "import matplotlib.cm as cm\n",
     "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
+    "from typing import List, Tuple\n",
+    "from pathlib import Path\n",
     "import os\n",
     "import torch\n",
     "from torch import Tensor\n",
     "from torch_geometric.utils import to_networkx\n",
     "from torch_geometric.datasets import Planetoid\n",
+    "from torch_geometric.utils import coalesce\n",
     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
     "data_dir = \"./data\"\n",
     "os.makedirs(data_dir, exist_ok=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "#l2g related imports\n",
-    "from l2gv2.clustering import distributed_clustering\n",
-    "from l2gv2.network.tgraph import TGraph\n",
-    "from l2gv2.network.npgraph import NPGraph\n",
-    "from l2gv2.patch.patches import create_patch_data\n",
-    "from l2gv2.patch.clustering import hierarchical_clustering\n"
+    "from l2gv2.patch.clustering import louvain_clustering\n",
+    "from l2gv2.graphs.tgraph import TGraph\n",
+    "from l2gv2.graphs.npgraph import NPGraph\n",
+    "from l2gv2.patch.patches import create_patch_data\n"
    ]
   },
   {
@@ -78,75 +78,76 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from l2gv2.datasets import get_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = get_dataset(\"as-733\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph\n",
-      "Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index\n",
-      "Processing...\n",
-      "Done!\n"
+     "ename": "AttributeError",
+     "evalue": "'AS733Dataset' object has no attribute 'x'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[42], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset\u001b[38;5;241m.\u001b[39mx\n",
+      "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch_geometric/data/in_memory_dataset.py:318\u001b[0m, in \u001b[0;36mInMemoryDataset.__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    315\u001b[0m         data_list \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindices()]\n\u001b[1;32m    316\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m Batch\u001b[38;5;241m.\u001b[39mfrom_data_list(data_list)[key]\n\u001b[0;32m--> 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    319\u001b[0m                      \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'AS733Dataset' object has no attribute 'x'"
      ]
     }
    ],
    "source": [
-    "dataset = Planetoid(root=data_dir, name='Cora')\n",
-    "data = dataset[0]"
+    "dataset.x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch_geometric.data import InMemoryDataset, Dataset"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of nodes: 2708\n",
-      "Number of edges: 10556\n",
-      "Has isolated nodes: False\n",
-      "Has self-loops: False\n",
-      "Is undirected: True\n",
-      "Average node degree: 3.90\n"
+      "Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])\n"
      ]
     }
    ],
    "source": [
-    "print(f'Number of nodes: {data.num_nodes}')\n",
-    "print(f'Number of edges: {data.num_edges}')\n",
-    "print(f'Has isolated nodes: {data.has_isolated_nodes()}')\n",
-    "print(f'Has self-loops: {data.has_self_loops()}')\n",
-    "print(f'Is undirected: {data.is_undirected()}')\n",
-    "print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')"
+    "for d in dataset:\n",
+    "    print(d)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'pd' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[16], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m G \u001b[38;5;241m=\u001b[39m to_networkx(data, to_undirected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m      2\u001b[0m degrees \u001b[38;5;241m=\u001b[39m [val \u001b[38;5;28;01mfor\u001b[39;00m (node, val) \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()]\n\u001b[0;32m----> 3\u001b[0m display(pd\u001b[38;5;241m.\u001b[39mDataFrame(pd\u001b[38;5;241m.\u001b[39mSeries(degrees)\u001b[38;5;241m.\u001b[39mdescribe())\u001b[38;5;241m.\u001b[39mtranspose()\u001b[38;5;241m.\u001b[39mround(\u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(degrees))\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28msum\u001b[39m(degrees))\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "G = to_networkx(data, to_undirected=True)\n",
     "degrees = [val for (node, val) in G.degree()]\n",
@@ -161,17 +162,101 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(2, 10556)\n"
-     ]
-    }
-   ],
+   "outputs": [],
+   "source": [
+    "G = to_networkx(data, to_undirected=True)\n",
+    "pos = nx.spring_layout(G, seed=42)\n",
+    "cent = nx.degree_centrality(G)\n",
+    "node_size = list(map(lambda x: x * 500, cent.values()))\n",
+    "cent_array = np.array(list(cent.values()))\n",
+    "threshold = sorted(cent_array, reverse=True)[10]\n",
+    "print(\"threshold\", threshold)\n",
+    "cent_bin = np.where(cent_array >= threshold, 1, 0.1)\n",
+    "plt.figure(figsize=(12, 12))\n",
+    "nodes = nx.draw_networkx_nodes(G, pos, node_size=node_size,\n",
+    "                               cmap=plt.cm.plasma,\n",
+    "                               node_color=cent_bin,\n",
+    "                               nodelist=list(cent.keys()),\n",
+    "                               alpha=cent_bin)\n",
+    "edges = nx.draw_networkx_edges(G, pos, width=0.25, alpha=0.3)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_dict = {\n",
+    "    0: \"Theory\",\n",
+    "    1: \"Reinforcement_Learning\",\n",
+    "    2: \"Genetic_Algorithms\",\n",
+    "    3: \"Neural_Networks\",\n",
+    "    4: \"Probabilistic_Methods\",\n",
+    "    5: \"Case_Based\",\n",
+    "    6: \"Rule_Learning\"}\n",
+    "data.y[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "aspath = Path(\"../data/snap-as/as_edges.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.read_parquet(aspath)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parameters: \n",
+    "# 10 patches\n",
+    "# Average degree k=4\n",
+    "# Overlap between 256 and 1024\n",
+    "# Embedding dimension up to 128"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def hierarchical_cluster_and_embed(graph: nx.Graph, m: int, k: int) -> List[Tuple[nx.Graph, List[float]]]:\n",
+    "    if graph.number_of_nodes() <= m:\n",
+    "        return [(graph, embed(graph))]\n",
+    "    \n",
+    "    clusters = cluster(graph, k)\n",
+    "    results = []\n",
+    "    \n",
+    "    for subgraph in clusters:\n",
+    "        results.extend(hierarchical_cluster_and_embed(subgraph, m, k))\n",
+    "    \n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "edge_index = data.edge_index.numpy()\n",
     "print(edge_index.shape)"