Skip to content

Commit

Permalink
Major restructuring of the dataloading
Browse files Browse the repository at this point in the history
  • Loading branch information
lotzma committed Feb 17, 2025
1 parent 51b2c59 commit 8026380
Show file tree
Hide file tree
Showing 753 changed files with 11,970,656 additions and 154 deletions.
215 changes: 150 additions & 65 deletions examples/clustering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -12,40 +12,40 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import community as community_louvain\n",
"import collections\n",
"import numpy as np\n",
"from IPython.display import display\n",
"import polars as pl\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import matplotlib.cm as cm\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from typing import List, Tuple\n",
"from pathlib import Path\n",
"import os\n",
"import torch\n",
"from torch import Tensor\n",
"from torch_geometric.utils import to_networkx\n",
"from torch_geometric.datasets import Planetoid\n",
"from torch_geometric.utils import coalesce\n",
"# Use the GPU when CUDA is available, otherwise fall back to the CPU.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"# Directory for local data files, relative to the notebook's working directory.\n",
"data_dir = \"./data\"\n",
"# exist_ok=True keeps this cell safe to re-run.\n",
"os.makedirs(data_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#l2g related imports\n",
"from l2gv2.clustering import distributed_clustering\n",
"from l2gv2.network.tgraph import TGraph\n",
"from l2gv2.network.npgraph import NPGraph\n",
"from l2gv2.patch.patches import create_patch_data\n",
"from l2gv2.patch.clustering import hierarchical_clustering\n"
"from l2gv2.patch.clustering import louvain_clustering\n",
"from l2gv2.graphs.tgraph import TGraph\n",
"from l2gv2.graphs.npgraph import NPGraph\n",
"from l2gv2.patch.patches import create_patch_data\n"
]
},
{
Expand Down Expand Up @@ -78,75 +78,76 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from l2gv2.datasets import get_dataset"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Load the \"as-733\" dataset through l2gv2's get_dataset helper.\n",
"dataset = get_dataset(\"as-733\")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph\n",
"Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index\n",
"Processing...\n",
"Done!\n"
"ename": "AttributeError",
"evalue": "'AS733Dataset' object has no attribute 'x'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[42], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset\u001b[38;5;241m.\u001b[39mx\n",
"File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch_geometric/data/in_memory_dataset.py:318\u001b[0m, in \u001b[0;36mInMemoryDataset.__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 315\u001b[0m data_list \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindices()]\n\u001b[1;32m 316\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Batch\u001b[38;5;241m.\u001b[39mfrom_data_list(data_list)[key]\n\u001b[0;32m--> 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mAttributeError\u001b[0m: 'AS733Dataset' object has no attribute 'x'"
]
}
],
"source": [
"dataset = Planetoid(root=data_dir, name='Cora')\n",
"data = dataset[0]"
"# The dataset object exposes no top-level `x` attribute (accessing it raises\n",
"# AttributeError), so take the first graph instead and bind it to `data`,\n",
"# the name the later cells of this notebook operate on.\n",
"# NOTE(review): assumes the first entry is the graph the later cells expect - confirm.\n",
"data = dataset[0]\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from torch_geometric.data import InMemoryDataset, Dataset"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of nodes: 2708\n",
"Number of edges: 10556\n",
"Has isolated nodes: False\n",
"Has self-loops: False\n",
"Is undirected: True\n",
"Average node degree: 3.90\n"
"Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])\n"
]
}
],
"source": [
"print(f'Number of nodes: {data.num_nodes}')\n",
"print(f'Number of edges: {data.num_edges}')\n",
"print(f'Has isolated nodes: {data.has_isolated_nodes()}')\n",
"print(f'Has self-loops: {data.has_self_loops()}')\n",
"print(f'Is undirected: {data.is_undirected()}')\n",
"print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')"
"# Print the summary repr of every graph object stored in the dataset.\n",
"for d in dataset:\n",
" print(d)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m G \u001b[38;5;241m=\u001b[39m to_networkx(data, to_undirected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m degrees \u001b[38;5;241m=\u001b[39m [val \u001b[38;5;28;01mfor\u001b[39;00m (node, val) \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()]\n\u001b[0;32m----> 3\u001b[0m display(pd\u001b[38;5;241m.\u001b[39mDataFrame(pd\u001b[38;5;241m.\u001b[39mSeries(degrees)\u001b[38;5;241m.\u001b[39mdescribe())\u001b[38;5;241m.\u001b[39mtranspose()\u001b[38;5;241m.\u001b[39mround(\u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(degrees))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28msum\u001b[39m(degrees))\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"outputs": [],
"source": [
"G = to_networkx(data, to_undirected=True)\n",
"degrees = [val for (node, val) in G.degree()]\n",
Expand All @@ -161,17 +162,101 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2, 10556)\n"
]
}
],
"outputs": [],
"source": [
"# Draw the graph with node size proportional to degree centrality and\n",
"# visually highlight the most central nodes.\n",
"G = to_networkx(data, to_undirected=True)\n",
"pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible\n",
"cent = nx.degree_centrality(G)\n",
"cent_array = np.array(list(cent.values()))\n",
"node_size = [c * 500 for c in cent.values()]\n",
"# The centrality found at index 10 of the descending ranking is the cut-off.\n",
"threshold = np.sort(cent_array)[::-1][10]\n",
"print(\"threshold\", threshold)\n",
"# Nodes at or above the cut-off get 1, all others 0.1; the same array\n",
"# drives both the colour and the transparency of each node.\n",
"cent_bin = np.where(cent_array >= threshold, 1, 0.1)\n",
"plt.figure(figsize=(12, 12))\n",
"nodes = nx.draw_networkx_nodes(\n",
"    G,\n",
"    pos,\n",
"    nodelist=list(cent.keys()),\n",
"    node_size=node_size,\n",
"    node_color=cent_bin,\n",
"    cmap=plt.cm.plasma,\n",
"    alpha=cent_bin,\n",
")\n",
"edges = nx.draw_networkx_edges(G, pos, width=0.25, alpha=0.3)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Class index -> topic name for the integer labels in data.y\n",
"# (presumably the seven Cora topic classes - verify against the loaded dataset).\n",
"label_dict = dict(enumerate([\n",
"    \"Theory\",\n",
"    \"Reinforcement_Learning\",\n",
"    \"Genetic_Algorithms\",\n",
"    \"Neural_Networks\",\n",
"    \"Probabilistic_Methods\",\n",
"    \"Case_Based\",\n",
"    \"Rule_Learning\",\n",
"]))\n",
"# Preview the first ten labels.\n",
"data.y[:10]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Parquet edge file loaded in the next cell; the path is relative to the\n",
"# notebook's working directory.\n",
"aspath = Path(\"../data/snap-as/as_edges.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the parquet file into a polars DataFrame and preview the first rows.\n",
"df = pl.read_parquet(aspath)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parameters: \n",
"# 10 patches\n",
"# Average degree k=4\n",
"# Overlap between 256 and 1024\n",
"# Embedding dimension up to 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def hierarchical_cluster_and_embed(graph: nx.Graph, m: int, k: int) -> List[Tuple[nx.Graph, List[float]]]:\n",
"    \"\"\"Recursively cluster ``graph`` and embed every leaf subgraph.\n",
"\n",
"    A graph with at most ``m`` nodes is embedded directly. A larger graph is\n",
"    split with ``cluster(graph, k)`` and each resulting subgraph is processed\n",
"    the same way.\n",
"\n",
"    NOTE(review): ``cluster`` and ``embed`` are not defined in the visible\n",
"    part of this notebook; they must be provided before calling this.\n",
"\n",
"    Args:\n",
"        graph: Graph to split.\n",
"        m: Size limit; graphs with ``<= m`` nodes are not split further.\n",
"        k: Passed through unchanged to ``cluster``.\n",
"\n",
"    Returns:\n",
"        A flat list of ``(subgraph, embedding)`` pairs, one per leaf.\n",
"    \"\"\"\n",
"    num_nodes = graph.number_of_nodes()\n",
"    if num_nodes <= m:\n",
"        return [(graph, embed(graph))]\n",
"\n",
"    results = []\n",
"    for subgraph in cluster(graph, k):\n",
"        if subgraph.number_of_nodes() >= num_nodes:\n",
"            # Clustering made no progress (e.g. a single cluster covering the\n",
"            # whole graph). Recursing would never terminate, so treat the\n",
"            # subgraph as a leaf even though it exceeds m.\n",
"            results.append((subgraph, embed(subgraph)))\n",
"        else:\n",
"            results.extend(hierarchical_cluster_and_embed(subgraph, m, k))\n",
"\n",
"    return results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"edge_index = data.edge_index.numpy()\n",
"print(edge_index.shape)"
Expand Down
Loading

0 comments on commit 8026380

Please sign in to comment.