-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path09_2_assignment4.py
91 lines (74 loc) · 2.57 KB
/
09_2_assignment4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Install xgboost from conda-forge
# https://xgboost.readthedocs.io/en/stable/install.html
# for CU supercomputer, install the gpu enabled version
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import xgboost as xgb
from plotnine import *
# Load presence-absence data for species "nz05"
nz05df = pd.read_csv("data/nz05.csv")
nz05df.head()
# Outline of New Zealand (header warning is harmless)
nzpoly = gpd.read_file("data/nzpoly.geojson")
type(nzpoly)
# Plot the records
nz05dfs = nz05df.sort_values("occ") #present on top
fig, ax = plt.subplots()
nzpoly.plot(ax=ax, color="lightgray")
scatter = ax.scatter(nz05dfs['x'], nz05dfs['y'], s=0.5, c=nz05dfs['occ'],
alpha=0.2)
legend1 = ax.legend(*scatter.legend_elements(), title="occ")
ax.add_artist(legend1)
plt.show()
# Data for modeling
nz05pa = nz05df.drop(columns=["group","siteid","x","y","toxicats"])
nz05pa.head()
# -- Example boosted model (5 secs)
# Prepare data for xgboost
nz05pa_xgb = xgb.DMatrix(data=nz05pa.drop(columns=["occ"]), label=nz05pa["occ"])
type(nz05pa_xgb)
nz05pa_xgb.feature_names
# Train
param = {"max_depth": 1, "eta": 0.01, "nthread": 2, "objective": "binary:logistic"}
nz05_train = xgb.train(param, nz05pa_xgb, num_boost_round=10000)
# Predict
nz05_prob = nz05_train.predict(nz05pa_xgb)
nz05_pred = 1 * (nz05_prob > 0.5)
type(nz05_pred)
# Characteristics of this prediction
plt.figure()
plt.hist(nz05_prob)
plt.show()
np.max(nz05_prob)
np.sum(nz05_prob > 0.5) #number of predicted presences
pd.crosstab(nz05_pred, nz05pa["occ"]) #confusion matrix
np.mean(nz05_pred == nz05pa["occ"]) #accuracy
np.mean(nz05_pred != nz05pa["occ"]) #error = 1 - accuracy
# -- Example prediction for a grid of the predictor variables across NZ
# Read in the grid of predictor variables
NZ_grid = pd.read_csv("data/NZ_predictors.csv")
NZ_grid.head()
# Prepare data for xgboost
NZ_grid_xgb = xgb.DMatrix(data=NZ_grid.drop(columns=["x","y"]))
NZ_grid_xgb.feature_names #matches the training data
#Predict
pred = NZ_grid[["x","y"]].copy()
pred["prob"] = nz05_train.predict(NZ_grid_xgb)
pred["present"] = 1 * (pred["prob"] > 0.5)
# Map probability prediction
plt.close()
(ggplot()
+ geom_raster(aes(x="x", y="y", fill="prob"), data=pred)
+ scale_fill_gradientn(colors=plt.cm.viridis.colors)
+ coord_equal()
+ theme_void()
+ labs(fill = "Probability"))
# Map presence prediction (not optimal but this works)
# Things that don't work: geom_tile; fill="factor(present)"
(ggplot()
+ geom_raster(aes(x="x", y="y", fill="present"), data=pred)
+ coord_equal()
+ theme_void()
+ labs(fill = "Present"))