Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DBSCAN Clustering Algorithm #1036

Merged
merged 2 commits into from
Feb 23, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
x1,x2
x,y
-7.849704268160314,-8.674808581953744
-8.69664575596503,-7.692212927859845
-1.0359915243020554,6.827713353515913
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
x1,x2
x,y
-10.718663580210745,9.900472871349784
-7.9399489058067685,10.718096194668659
-9.199206305612545,6.923022960440686
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
x1,x2
x,y
2.498895550686414,1.1683342316471301
5.073222450036444,0.8623713271387572
4.672790880324094,2.6418773621498803
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
x1,x2
x,y
-0.9914878645939103,0.12004373871846165
-0.8064234011393798,-0.5891152116668836
-0.4265373027932866,-0.6749570357883385
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
x1,x2
x,y
2.001164955520126,0.43029947571735944
-0.6744557747817002,0.7365153394030783
0.13616835175111489,1.0001429333017973
Expand Down
22 changes: 22 additions & 0 deletions examples/d3/DBSCAN/DBSCAN_Cluster/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>ml5.js DBSCAN example</title>
    <script src="https://d3js.org/d3.v4.min.js"></script>
    <!-- ml5 is fetched from a local server on port 8080, so that server has to be running -->
    <script src="http://localhost:8080/ml5.js" type="text/javascript"></script>
  </head>
  <body>
    <center>
      <h1>DBSCAN Example</h1>
      <p style="width: 60%;"> DBSCAN is a <i>density-based clustering non-parametric algorithm</i>: given a set of points in some space, it groups together points that are closely packed together (points with many nearby neighbors),
        marking as outliers points that lie alone in low-density regions (whose nearest neighbors are too far away).
        DBSCAN is one of the most common clustering algorithms and also most cited in scientific literature.
      </p>
      <div>

        <!-- sketch.js appends one button per preset here -->
        <div id="buttons"></div>
        <!-- sketch.js draws the svg scatter plot here -->
        <div id="chart"></div>
      </div>
    </center>
    <!-- loaded last so that #buttons and #chart already exist when the sketch runs -->
    <script src="sketch.js"></script>
  </body>
</html>
115 changes: 115 additions & 0 deletions examples/d3/DBSCAN/DBSCAN_Cluster/sketch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Model returned by the latest ml5.dbscan() call; make() reassigns it on every run.
let dbscanModel;
// Size (in pixels) given to the svg that makeChart() creates.
const width = 640;
const height = 480;
// Fill colour per cluster id; makeChart() looks a point's `clusterid` up in here.
const colDict = {
0: "skyblue",
1: "coral",
2: "olive",
3: "tan",
4: "grey",
};

// ----- Bootstrap the example ----- //
// Builds the preset buttons, then runs one clustering so that a chart is
// drawn before the user has clicked anything.
function init() {
  // eps used for the very first run; it matches the first preset button
  const initialEps = 1.55;

  createButtons();
  make(initialEps);
}
init();

// STEP 1:
// add one button per preset: three numeric eps values plus the two
// named datasets ("circle" and "moon")
function createButtons() {
  const presets = [1.55, 1.56, 2, "circle", "moon"];

  presets.forEach(preset => {
    // pass the preset only, so forEach's index is never taken for minPts
    addClusterButton(preset);
  });
}

// STEP 2:
// create the model
/**
 * Cluster one of the bundled datasets; modelReady() is handed to ml5 as the
 * callback.
 * @param {number|string} eps - Either a numeric eps, or "moon" / "circle" to
 *   pick that dataset together with its dedicated eps (0.1 / 0.16). The value
 *   is also used verbatim to build the csv file name.
 * @param {number} [minPts=3] - Neighbourhood size needed for a core point.
 */
function make(eps, minPts = 3) {
  // the two named datasets come with their own eps
  let radius = eps;
  if (eps === "moon") {
    radius = 0.1;
  } else if (eps === "circle") {
    radius = 0.16;
  }

  const options = {
    eps: radius,
    minPts,
  };
  console.log(eps, options.eps);

  // get the path to the data in our data folder dynamically
  const dataPath = `data/gaussian2d_${eps}clusters.csv`;
  // create a new dbscan clustering each time on make()
  dbscanModel = ml5.dbscan(dataPath, options, modelReady);
}

// Step 3:
// once the clustering is done, log the labelled points and draw them
function modelReady() {
  const { dataset } = dbscanModel;

  console.log(dataset);
  makeChart();
}

// Step 4:
// use the fancy d3 to make magic
/**
 * Redraw the scatter plot of dbscanModel.dataset inside #chart.
 * Every point is drawn black first and then transitions to the colour of its
 * cluster. Points without a palette entry (noise points carry no `clusterid`)
 * stay black.
 */
function makeChart() {
  const { dataset } = dbscanModel;
  const chart = d3.select("#chart");

  // clear the chart each time
  // less efficient, but simple
  // (only the svg inside #chart is removed, not the first svg of the page)
  chart.select("svg").remove();

  // reappend the svg to the chart area
  const svg = chart
    .append("svg")
    .attr("width", width)
    .attr("height", height);

  // d[0] is for the x value in the array
  const xScale = d3
    .scaleLinear()
    .domain(d3.extent(dataset, d => d[0]))
    .range([10, width - 100]);

  // d[1] is for the y value in the array
  // the range is flipped because svg y grows downwards
  const yScale = d3
    .scaleLinear()
    .domain(d3.extent(dataset, d => d[1]))
    .range([height - 50, 20]);

  svg
    .selectAll("circle")
    .data(dataset)
    .enter()
    .append("circle")
    .attr("cx", d => xScale(d[0]))
    .attr("cy", d => yScale(d[1]))
    .attr("r", 6)
    .attr("fill", "black");

  // fade every circle to its cluster colour, with an explicit fallback for
  // ids that are not in the palette
  svg
    .selectAll("circle")
    .transition()
    .attr("fill", d => colDict[d.clusterid] || "black");
}

// adds the buttons for the respective cluster data
// we could also use d3.append() and d3.select() here :)
/**
 * Append a button to #buttons that re-runs the clustering when clicked.
 * @param {number|string} eps - Preset forwarded to make().
 * @param {number} [minPts=3] - Shown on the label and forwarded to make().
 * @returns {HTMLElement} The button that was appended.
 */
function addClusterButton(eps, minPts = 3) {
  const btn = document.createElement("BUTTON");
  btn.innerText = `cluster: ${eps} & minPts: ${minPts}`;

  // forward minPts too, so a click runs what the label advertises
  btn.addEventListener("click", () => {
    make(eps, minPts);
  });

  document.querySelector("#buttons").appendChild(btn);

  return btn;
}
2 changes: 1 addition & 1 deletion examples/examples.json

Large diffs are not rendered by default.

180 changes: 180 additions & 0 deletions src/DBSCAN/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
// Copyright (c) 2020 ml5
//
// This software is released under the MIT License.
// https://opensource.org/licenses/MIT

/* eslint "no-param-reassign": [2, { "props": false }] */
/*
DBSCAN Algorithm (with Euclidian distance). Influenced By jDBSCAN
*/

import * as tf from "@tensorflow/tfjs";
import callCallback from "../utils/callcallback";

/**
 * Load every row of the csv file found at `path`.
 * @param {string} path - Location of the csv file.
 * @returns {Promise<Array>} Resolves with whatever
 *   `tf.data.csv(path).toArray()` yields.
 */
async function readCsv(path) {
  return tf.data.csv(path).toArray();
}

/**
 * Turn the input into an array of plain value arrays, one per row.
 * A string is treated as the path of a csv file and read with readCsv();
 * anything else is used as the rows directly (array of arrays or array of
 * objects).
 * @param {string || array || object} inputData
 * @returns {Promise<Array>} One `Object.values(row)` array per row.
 */
async function loadDataset(inputData) {
  const rows = typeof inputData === "string" ? await readCsv(inputData) : inputData;
  return rows.map(row => Object.values(row));
}

// Settings the DBSCAN constructor falls back to for options that are not supplied.
const DEFAULTS = {
// distance limit used by getNeighboursIndices() when collecting neighbours
eps: 50,
// neighbourhood size from which fit()/extend() treat a point as a core point
minPts: 3,
};

class DBSCAN {
  /**
   * Create a DBSCAN model and immediately start loading and clustering `dataset`.
   * @param {String || array || object} dataset - The dataset to cluster: a csv
   *    path, or rows in x, y format => [{x:1,y:2}] / [[1, 2]].
   * @param {options} [options] - An object describing a model's parameters:
   *    - eps: Largest distance at which two points still count as neighbours
   *    - minPts: Smallest neighbourhood size that makes a point a core point
   *      (a point is part of its own neighbourhood, its distance to itself is 0)
   *    Missing or falsy values fall back to DEFAULTS.
   * @param {function} [callback] - Optional. A callback to be called once
   *    the model has loaded. It is handed to callCallback together with the
   *    load() promise; the result is exposed as `this.ready`.
   */
  constructor(dataset, options, callback) {
    // tolerate `dbscan(data)` without an options object instead of throwing
    const settings = options || {};
    this.config = {
      eps: settings.eps || DEFAULTS.eps,
      minPts: settings.minPts || DEFAULTS.minPts,
    };
    this.lastClusterId = 0;
    this.status = [];
    this.ready = callCallback(this.load(dataset), callback);
  }

  /**
   * Load dataset, and run model.
   * After this resolves every row of `this.dataset` carries a `status`
   * (0 for noise, otherwise its cluster id) and, for clustered points, a
   * `clusterid` (1, 2, ...).
   * @param {string || array || object} dataset
   * @returns {Promise<DBSCAN>} Resolves with this instance.
   */
  async load(dataset) {
    this.dataset = await loadDataset(dataset);
    // restart the numbering, otherwise a second load() would continue with
    // the ids of the previous run
    this.lastClusterId = 0;
    // NOTE: the tensors are created inside tf.tidy, which releases tensors
    // created in its callback once it returns; they are only meant to be used
    // while fit() runs here.
    tf.tidy(() => {
      this.dataTensor = tf.tensor2d(this.dataset);
      this.dataset.forEach(d => {
        d.tensor = tf.tensor1d(Object.values(d));
      });
      this.fit();
    });
    return this;
  }

  /**
   * Run DBSCAN algorithm.
   * Visits every point that has no status yet: a point with fewer than
   * minPts neighbours is left as noise (status 0, no clusterid); any other
   * point starts a new cluster that is grown with extend().
   */
  fit() {
    this.dataset.forEach((point, idx) => {
      if (point.status !== undefined) {
        // already reached while an earlier cluster was growing
        return;
      }
      point.status = 0; // noise until proven otherwise
      const neighboursIndices = this.getNeighboursIndices(point);
      if (neighboursIndices.length >= this.config.minPts) {
        this.incrementClusterId();
        this.extend(idx, neighboursIndices);
      }
    });
  }

  /**
   * Extend the current cluster: label the given point and everything that is
   * reachable from it through core points with the current cluster id.
   * Works through a queue instead of recursing, so large clusters cannot
   * exhaust the call stack.
   * @param {number} pointIndex
   * @param {number[]} neighboursIndices
   */
  extend(pointIndex, neighboursIndices) {
    const clusterId = this.getClusterId();
    const seed = this.dataset[pointIndex];
    seed.clusterid = clusterId;
    seed.status = clusterId;

    // every index is queued at most once
    const pending = Array.from(neighboursIndices);
    const queued = new Set(pending);

    for (let next = 0; next < pending.length; next += 1) {
      const candidate = this.dataset[pending[next]];

      if (candidate.status === undefined) {
        // status unknown: treat as noise until its neighbourhood is known
        candidate.status = 0;
        const reachable = this.getNeighboursIndices(candidate);
        if (reachable.length >= this.config.minPts) {
          // a core point: its neighbours belong to this cluster as well
          reachable.forEach(index => {
            if (!queued.has(index)) {
              queued.add(index);
              pending.push(index);
            }
          });
        }
      }

      // unlabelled and noise points join the cluster; points that already
      // belong to a cluster are left alone
      if (candidate.status < 1) {
        candidate.status = clusterId;
        candidate.clusterid = clusterId;
      }
    }
  }

  /**
   * Return last generated cluster id
   */
  getClusterId() {
    return this.lastClusterId;
  }

  /**
   * increment cluster id
   */
  incrementClusterId() {
    this.lastClusterId += 1;
  }

  /**
   * Find the indices of all points whose Euclidean distance to `point` is at
   * most eps (the point itself included), in ascending index order.
   * Best effort: if the distance computation throws, the error is logged and
   * an empty list is returned.
   * @param {object} point - A dataset row carrying its `tensor`.
   * @returns {number[]} Indices into `this.dataset`.
   */
  getNeighboursIndices(point) {
    try {
      // one distance per dataset row
      const distances = tf.tidy(() =>
        tf
          .squaredDifference(point.tensor, this.dataTensor)
          .sum(1)
          .sqrt()
          .arraySync(),
      );
      const { eps } = this.config;
      const neighbours = [];
      distances.forEach((distance, index) => {
        if (distance <= eps) {
          neighbours.push(index);
        }
      });
      return neighbours;
    } catch (error) {
      console.log(`error ${error}`);
    }
    return [];
  }
}

/**
 * Factory behind the module's default export: builds a DBSCAN instance from
 * the given arguments and returns it.
 */
const dbscan = (dataset, options, callback) => {
  const model = new DBSCAN(dataset, options, callback);
  return model;
};

export default dbscan;
33 changes: 33 additions & 0 deletions src/DBSCAN/index_test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) 2019 ml5
//
// This software is released under the MIT License.
// https://opensource.org/licenses/MIT

const { dbscan } = ml5;

// Options handed to the factory; the values equal the library's own defaults.
const DBSCAN_DEFAULTS = {
  eps: 50,
  minPts: 3,
};

describe("DBSCAN", () => {
  const dataurl =
    "https://raw.githubusercontent.com/asvsfs/ml5-library/dbscan/examples/d3/DBSCAN/DBSCAN_Cluster/data/gaussian2d_1.55clusters.csv";
  let dbscanModel;

  beforeAll(async () => {
    // the csv is fetched over the network, so allow more time than the default
    jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;
    // NOTE(review): the factory hands back the model itself, so the explicit
    // load() below is what the specs actually wait for - confirm whether
    // awaiting `dbscanModel.ready` would be enough
    dbscanModel = await dbscan(dataurl, DBSCAN_DEFAULTS, () => {});
    await dbscanModel.load(dataurl);
  });

  it("Should create dbscan with all the defaults", async () => {
    const { eps, minPts } = dbscanModel.config;
    expect(eps).toBe(DBSCAN_DEFAULTS.eps);
    expect(minPts).toBe(DBSCAN_DEFAULTS.minPts);
  });

  it("dbscanModel dataset : Should have length 300", async () => {
    const { dataset } = dbscanModel;
    expect(dataset.length).toBe(300);
  });
});
Loading