
Merge pull request #129 from rllm-team/develop
update trompt and format
JianwuZheng413 authored Nov 24, 2024
2 parents d31dbe2 + 48c9dca commit 168e57e
Showing 13 changed files with 299 additions and 141 deletions.
4 changes: 1 addition & 3 deletions examples/bridge/utils.py
@@ -1,14 +1,12 @@
from typing import Any, Dict, List, Optional, Callable, Type
from typing import Optional, Type

import pandas as pd
import torch
from torch import Tensor
from torch.nn import Module
import torch.nn.functional as F

from rllm.types import ColType
from rllm.data import GraphData
from rllm.transforms.table_transforms import FTTransformerTransform
from rllm.transforms.graph_transforms import GCNNorm
from rllm.nn.conv.table_conv import TabTransformerConv
from rllm.nn.conv.graph_conv import GCNConv
52 changes: 31 additions & 21 deletions examples/ft_transformer.py
@@ -1,10 +1,18 @@
# The FT-Transformer method from the
# "Revisiting Deep Learning Models for Tabular Data" paper.
# ArXiv: https://arxiv.org/abs/2106.11959

# Datasets Titanic Adult
# Acc 0.780 0.859
# Time 9.8s 272.1s

import argparse
import os.path as osp
import sys
import time
from typing import Any, Dict, List

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import torch
from torch import Tensor
from torch.utils.data import DataLoader
@@ -18,13 +26,13 @@
from rllm.nn.conv.table_conv import FTTransformerConv

parser = argparse.ArgumentParser()
parser.add_argument("--dim", help="embedding dim.", type=int, default=32)
parser.add_argument("--dim", help="embedding dim", type=int, default=32)
parser.add_argument("--num_layers", type=int, default=3)
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--lr", type=float, default=0.001)
parser.add_argument("--epochs", type=int, default=50)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--wd", type=float, default=5e-4)
parser.add_argument("--batch_size", type=int, default=256)
parser.add_argument("--lr", type=float, default=1e-4)
parser.add_argument("--wd", type=float, default=1e-5)
parser.add_argument("--epochs", type=int, default=100)
parser.add_argument("--seed", type=int, default=0)
args = parser.parse_args()

torch.manual_seed(args.seed)
@@ -106,25 +114,24 @@ def train(epoch: int) -> float:
@torch.no_grad()
def test(loader: DataLoader) -> float:
model.eval()
all_preds = []
all_labels = []
correct = total = 0
for batch in loader:
x, y = batch
pred = model.forward(x)
all_labels.append(y.cpu())
all_preds.append(pred[:, 1].detach().cpu())
all_labels = torch.cat(all_labels).numpy()
all_preds = torch.cat(all_preds).numpy()

# Compute the overall AUC
overall_auc = roc_auc_score(all_labels, all_preds)
return overall_auc
feat_dict, y = batch
pred = model.forward(feat_dict)
_, predicted = torch.max(pred, 1)
total += y.size(0)
correct += (predicted == y).sum().item()
accuracy = correct / total
return accuracy


metric = "AUC"
metric = "Acc"
best_val_metric = 0
best_test_metric = 0
times = []
st = time.time()
for epoch in range(1, args.epochs + 1):
start = time.time()
train_loss = train(epoch)
train_metric = test(train_loader)
val_metric = test(val_loader)
@@ -134,12 +141,15 @@ def test(loader: DataLoader) -> float:
best_val_metric = val_metric
best_test_metric = test_metric

times.append(time.time() - start)
print(
f"Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, "
f"Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}"
)
optimizer.step()

et = time.time()
print(f"Mean time per epoch: {torch.tensor(times).mean():.4f}s")
print(f"Total time: {et-st}s")
print(
f"Best Val {metric}: {best_val_metric:.4f}, "
f"Best Test {metric}: {best_test_metric:.4f}"
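Read together, the hunks above replace the ROC-AUC evaluation in examples/ft_transformer.py with plain classification accuracy. A minimal sketch of how the updated routine reads once the change is applied, assuming model, DataLoader, and the loaders are defined by the surrounding script as shown:

@torch.no_grad()
def test(loader: DataLoader) -> float:
    model.eval()
    correct = total = 0
    for batch in loader:
        feat_dict, y = batch                  # per-column feature dict and labels
        logits = model(feat_dict)             # shape [batch_size, num_classes]
        predicted = logits.argmax(dim=1)      # most likely class per sample
        total += y.size(0)
        correct += (predicted == y).sum().item()
    return correct / total                    # accuracy over the whole loader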
45 changes: 28 additions & 17 deletions examples/tab_transformer.py
@@ -1,10 +1,18 @@
# The TabTransformer method from the
# "TabTransformer: Tabular Data Modeling Using Contextual Embeddings" paper.
# ArXiv: https://arxiv.org/abs/2012.06678

# Datasets Titanic Adult
# AUC 0.809 0.839
# Time 11.3s 391.1s

import argparse
import os.path as osp
import sys
import time
from typing import Any, Dict, List

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
@@ -108,40 +116,43 @@ def train(epoch: int) -> float:
@torch.no_grad()
def test(loader: DataLoader) -> float:
model.eval()
all_preds = []
all_labels = []
correct = total = 0
for batch in loader:
x, y = batch
pred = model.forward(x)
all_labels.append(y.cpu())
all_preds.append(pred[:, 1].detach().cpu())
all_labels = torch.cat(all_labels).numpy()
all_preds = torch.cat(all_preds).numpy()

overall_auc = roc_auc_score(all_labels, all_preds)
return overall_auc
feat_dict, y = batch
pred = model.forward(feat_dict)
_, predicted = torch.max(pred, 1)
total += y.size(0)
correct += (predicted == y).sum().item()
accuracy = correct / total
return accuracy


metric = "AUC"
metric = "Acc"
best_val_metric = 0
res_test_metric = 0
best_test_metric = 0
times = []
st = time.time()
for epoch in range(1, args.epochs + 1):
start = time.time()
train_loss = train(epoch)
train_metric = test(train_loader)
val_metric = test(val_loader)
test_metric = test(test_loader)

if val_metric > best_val_metric:
best_val_metric = val_metric
res_test_metric = test_metric
best_test_metric = test_metric

times.append(time.time() - start)
print(
f"Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, "
f"Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}"
)
optimizer.step()

et = time.time()
print(f"Mean time per epoch: {torch.tensor(times).mean():.4f}s")
print(f"Total time: {et-st}s")
print(
f"Best Val {metric}: {best_val_metric:.4f}, "
f"Best Test {metric}: {res_test_metric:.4f}"
f"Best Test {metric}: {best_test_metric:.4f}"
)
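The commit also adds wall-clock timing around each example's training loop, reported both per epoch and in total. A condensed sketch of the pattern shared by the touched scripts, assuming train, test, the loaders, and args come from the surrounding code:

best_val_metric = best_test_metric = 0.0
times = []
st = time.time()
for epoch in range(1, args.epochs + 1):
    start = time.time()
    train_loss = train(epoch)
    val_metric = test(val_loader)
    test_metric = test(test_loader)
    if val_metric > best_val_metric:          # keep the test score from the best validation epoch
        best_val_metric = val_metric
        best_test_metric = test_metric
    times.append(time.time() - start)         # seconds spent on this epoch
    print(f"Epoch {epoch}: loss {train_loss:.4f}, val {val_metric:.4f}, test {test_metric:.4f}")

print(f"Mean time per epoch: {torch.tensor(times).mean():.4f}s")
print(f"Total time: {time.time() - st:.1f}s")
print(f"Best Val: {best_val_metric:.4f}, Best Test: {best_test_metric:.4f}")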
59 changes: 34 additions & 25 deletions examples/tabnet.py
@@ -1,42 +1,52 @@
# The TabNet method from the
# "TabNet: Attentive Interpretable Tabular Learning" paper.
# ArXiv: https://arxiv.org/abs/1908.07442

# Datasets Titanic Adult
# Acc 0.843 0.853
# Time 31.1s 454.8s

import argparse
import os.path as osp
import sys
import time
from typing import Any, Dict, List

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

sys.path.append("./")
sys.path.append("../")
from rllm.types import ColType
from rllm.datasets.titanic import Titanic
from rllm.datasets.adult import Adult
from rllm.transforms.table_transforms import TabNetTransform
from rllm.nn.models import TabNet

parser = argparse.ArgumentParser()
parser.add_argument("--dim", help="embedding dim", type=int, default=32)
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--lr", type=float, default=0.001)
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--wd", type=float, default=5e-4)
parser.add_argument("--epochs", type=int, default=50)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--wd", type=float, default=5e-4)
args = parser.parse_args()

torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare datasets
path = osp.join(osp.dirname(osp.realpath(__file__)), "..", "data")
dataset = Titanic(cached_dir=path)[0]
dataset = Adult(cached_dir=path)[0]
# dataset = Titanic(cached_dir=path)[0]
dataset.to(device)
dataset.shuffle()

# Split dataset, here the ratio of train-val-test is 80%-10%-10%
# Split dataset; 26,048/6,513/16,281 rows follow the standard Adult train-val-test split
train_loader, val_loader, test_loader = dataset.get_dataloader(
0.8, 0.1, 0.1, batch_size=args.batch_size
26048, 6513, 16281, batch_size=args.batch_size
)
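For reference, the get_dataloader call above switches from fractional splits to absolute row counts: 26,048/6,513/16,281 appears to correspond to the conventional Adult benchmark split (the official 32,561-row training file divided roughly 80/20 into train and validation, plus the official 16,281-row test file). The two call styles side by side, with the signature taken from the diff rather than verified against the library:

# Old: fractions of the shuffled table.
# train_loader, val_loader, test_loader = dataset.get_dataloader(
#     0.8, 0.1, 0.1, batch_size=args.batch_size)

# New: absolute row counts matching the standard Adult split.
train_loader, val_loader, test_loader = dataset.get_dataloader(
    26048, 6513, 16281, batch_size=args.batch_size)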


@@ -99,28 +109,24 @@ def train(epoch: int, lambda_sparse: float = 1e-4) -> float:
@torch.no_grad()
def test(loader: DataLoader) -> float:
model.eval()
all_preds = []
all_labels = []
correct = total = 0
for batch in loader:
x, y = batch
pred, _ = model.forward(x)
all_labels.append(y.cpu())
all_preds.append(pred[:, 1].detach().cpu())
all_labels = torch.cat(all_labels).numpy()
all_preds = torch.cat(all_preds).numpy()
# if np.isnan(all_labels).any():
# print("NaN found in all_labels")
# if np.isnan(all_preds).any():
# print("NaN found in all_preds")
# Compute the overall AUC
overall_auc = roc_auc_score(all_labels, all_preds)
return overall_auc


metric = "AUC"
feat_dict, y = batch
pred, _ = model.forward(feat_dict)
_, predicted = torch.max(pred, 1)
total += y.size(0)
correct += (predicted == y).sum().item()
accuracy = correct / total
return accuracy


metric = "Acc"
best_val_metric = 0
best_test_metric = 0
times = []
st = time.time()
for epoch in range(1, args.epochs + 1):
start = time.time()
train_loss = train(epoch)
train_metric = test(train_loader)
val_metric = test(val_loader)
@@ -130,12 +136,15 @@ def test(loader: DataLoader) -> float:
best_val_metric = val_metric
best_test_metric = test_metric

times.append(time.time() - start)
print(
f"Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, "
f"Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}"
)
optimizer.step()

et = time.time()
print(f"Mean time per epoch: {torch.tensor(times).mean():.4f}s")
print(f"Total time: {et-st}s")
print(
f"Best Val {metric}: {best_val_metric:.4f}, "
f"Best Test {metric}: {best_test_metric:.4f}"
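Unlike the other examples, TabNet's forward pass returns a pair, so the rewritten evaluation loop unpacks the logits and discards the second value (presumably the sparsity term that train() weights with lambda_sparse). A minimal sketch under that assumption, again relying on model and the loaders from the surrounding script:

@torch.no_grad()
def test(loader: DataLoader) -> float:
    model.eval()
    correct = total = 0
    for feat_dict, y in loader:
        logits, _ = model(feat_dict)          # second output is unused at evaluation time
        predicted = logits.argmax(dim=1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
    return correct / total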