-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e2448e9
commit 48a9f66
Showing
32 changed files
with
986 additions
and
110 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
.vscode | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from importlib.metadata import PackageNotFoundError, version | ||
|
||
def _installed_version() -> str:
    """Return the version of the ``pytorch-cortex`` distribution.

    Falls back to the placeholder ``"unknown version"`` when the metadata
    lookup raises ``PackageNotFoundError``.
    """
    try:
        return version("pytorch-cortex")
    except PackageNotFoundError:
        return "unknown version"


__version__ = _installed_version()
2 changes: 1 addition & 1 deletion
2
cortex/config/hydra/branch/default.yaml → ...nfig/hydra/branches/protein_property.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
binding: | ||
protein_property: | ||
_target_: cortex.model.branch.Conv1dBranch | ||
out_dim: 8 | ||
embed_dim: ${channel_dim} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# @package _global_ | ||
dtype: float | ||
seed: 0 # random seed, set to null to use random seed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# @package _global_ | ||
# Directories for loading and storing data | ||
ckpt_name: ${job_name} | ||
ckpt_file: ${ckpt_name}.pt | ||
ckpt_cfg: ${ckpt_name}.yaml | ||
save_ckpt: true | ||
|
||
data_dir: /home/stantos5/scratch/code/remote/prescient-github/cortex/temp | ||
project_name: cortex | ||
__version__: null | ||
exp_name: dry_run | ||
job_name: null | ||
timestamp: ${now:%Y-%m-%d_%H-%M-%S} | ||
log_dir: ${data_dir}/${exp_name}/${job_name}/${timestamp} # use this directory for local output | ||
wandb_mode: online | ||
wandb_host: https://api.wandb.ai | ||
warnings_filter: ignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# @package _global_ | ||
channel_dim: 128 | ||
embed_dim: 32 | ||
ensemble_size: 4 | ||
dropout_prob: 0.0 | ||
kernel_size: 5 | ||
pooling_type: mean |
10 changes: 5 additions & 5 deletions
10
cortex/config/hydra/root/default.yaml → cortex/config/hydra/roots/protein_seq.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
delta_g: | ||
_target_: cortex.task.RegressionTask | ||
input_map: | ||
protein_seq: ['tokenized_seq'] | ||
outcome_cols: ['foldx_total_energy'] | ||
corrupt_train_inputs: true | ||
corrupt_inference_inputs: false | ||
root_key: protein_seq | ||
nominal_label_var: 0.01 | ||
data_module: | ||
_target_: cortex.data.data_module.TaskDataModule | ||
_recursive_: false | ||
batch_size: ${fit.batch_size} | ||
balance_train_partition: null | ||
drop_last: true | ||
lengths: [1.0, 0.0] | ||
train_on_everything: false | ||
num_workers: ${num_workers} | ||
dataset_config: | ||
_target_: cortex.data.dataset.RedFluorescentProteinDataset | ||
root: ${dataset_root_dir} | ||
download: ${download_datasets} | ||
train: ??? |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
defaults: | ||
- general_settings: default | ||
- logging: default | ||
- model_globals: default | ||
- roots: [protein_seq] | ||
- trunk: default | ||
- branches: [protein_property] | ||
- tree: protein_model | ||
- tasks: | ||
- protein_property/delta_g | ||
- _self_ | ||
|
||
fit: | ||
batch_size: 32 | ||
|
||
trainer: | ||
_target_: lightning.Trainer | ||
accelerator: gpu | ||
max_epochs: 64 | ||
devices: 1 | ||
# devices: 8 | ||
# strategy: ddp | ||
num_sanity_val_steps: 1 | ||
|
||
|
||
tree: | ||
_recursive_: false | ||
fit_cfg: | ||
reinitialize_roots: true | ||
linear_probing: false | ||
weight_averaging: null | ||
optimizer: | ||
_target_: torch.optim.Adam | ||
lr: 5e-3 | ||
weight_decay: 0. | ||
betas: [0.99, 0.999] | ||
fused: false | ||
lr_scheduler: | ||
_target_: transformers.get_cosine_schedule_with_warmup | ||
num_warmup_steps: 10 | ||
num_training_steps: ${trainer.max_epochs} | ||
|
||
tasks: | ||
|
||
protein_property: | ||
delta_g: | ||
# ensemble_size: ${ensemble_size} | ||
ensemble_size: 1 | ||
|
||
train_on_everything: false | ||
linear_probing: false | ||
dataset_root_dir: /home/stantos5/scratch/datasets | ||
download_datasets: true | ||
num_workers: 2 | ||
|
||
ckpt_name: ${exp_name}_${job_name} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
_target_: cortex.model.tree.SequenceModelTree |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,10 @@ | ||
from ._dataframe_dataset import DataFrameDataset, ordered_dict_collator | ||
from ._data_frame_dataset import DataFrameDataset, ordered_dict_collator | ||
from ._rfp_dataset import RedFluorescentProteinDataset | ||
from ._transformed_dataset import TransformedDataset | ||
|
||
__all__ = [ | ||
"DataFrameDataset", | ||
"TransformedDataset", | ||
"RedFluorescentProteinDataset", | ||
"ordered_dict_collator", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import os | ||
from collections import OrderedDict | ||
from pathlib import Path | ||
from typing import Any, Optional, TypeVar, Union | ||
|
||
import pandas as pd | ||
from pandas import DataFrame | ||
from torch.utils.data import Dataset | ||
|
||
from cortex.io import download_and_extract_archive | ||
from cortex.transforms import Transform | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
class DataFrameDataset(Dataset):
    """Map-style dataset backed by a pandas ``DataFrame`` read from disk.

    The table is loaded from ``<root>/<_name>/<_target>`` (CSV or Parquet),
    optionally de-duplicated, and split 80/20 into train and test partitions
    using a seeded random sample. Only the selected partition is kept.

    Class attributes that subclasses may override:

    * ``_name``: name of the dataset subdirectory under ``root``.
    * ``_target``: file name of the table inside that subdirectory.
    * ``columns``: columns exposed by ``__getitem__``; when ``None`` every
      column of the loaded table is used.
    """

    _data: DataFrame
    _name: str = "temp"
    _target: str = "data.csv"
    columns = None

    def __init__(
        self,
        root: Union[str, Path],
        *,
        download: bool = False,
        download_source: Optional[str] = None,
        dedup: bool = True,
        train: bool = True,
        random_seed: int = 0xDEADBEEF,
        **kwargs: Any,
    ) -> None:
        """
        :param root: Root directory where the dataset subdirectory exists or,
            if :attr:`download` is ``True``, the directory where the dataset
            subdirectory will be created and the dataset downloaded.
        :param download: Fetch the archive from ``download_source`` when the
            target file is not already present.
        :param download_source: Location of the archive; required when a
            download is actually needed.
        :param dedup: Drop duplicated rows before splitting.
        :param train: Keep the 80% training partition if ``True``, otherwise
            the remaining 20% test partition.
        :param random_seed: Seed for the random train/test split.
        :param kwargs: Forwarded to ``pandas.read_csv`` / ``pandas.read_parquet``.
        :raises ValueError: If the target file is missing and ``download`` is
            ``False``, if a download is needed but ``download_source`` is
            ``None``, or if the target file extension is not supported.
        """
        if isinstance(root, str):
            root = Path(root).resolve()
        self._root = root

        path = self._root / self._name
        target_path = path / self._target

        if not target_path.exists():
            if not download:
                raise ValueError(
                    f"Dataset not found at {path}. If `download` is `True`, the dataset will be downloaded."
                )
            if download_source is None:
                raise ValueError("If `download` is `True`, `download_source` must be provided.")
            download_and_extract_archive(
                resource=download_source,
                source=path,
                destination=path,
                name=f"{self._name}.tar.gz",
                remove_archive=True,
            )

        if self._target.endswith(".csv"):
            data = pd.read_csv(target_path, **kwargs)
        elif self._target.endswith(".parquet"):
            data = pd.read_parquet(target_path, **kwargs)
        else:
            raise ValueError(f"Unsupported file format: {self._target}")

        if self.columns is None:
            # Stored on the instance so the class-level default stays ``None``.
            self.columns = list(data.columns)

        if dedup:
            data = data.drop_duplicates()

        # Split data into train and test using the random seed: 80% of the
        # rows are sampled for training, the remaining index labels are test.
        train_indices = data.sample(frac=0.8, random_state=random_seed).index
        test_indices = data.index.difference(train_indices)

        select_indices = train_indices if train else test_indices
        self._data = data.loc[select_indices].reset_index(drop=True)

    def __len__(self) -> int:
        """Return the number of rows in the selected partition."""
        return len(self._data)

    def _fetch_item(self, index) -> pd.DataFrame:
        """Select rows by position.

        An ``int`` index always yields a one-row ``DataFrame``. Any other
        index (for example a slice) is passed to ``DataFrame.iloc`` as is.

        :raises IndexError: If an ``int`` index is out of bounds.
        """
        if isinstance(index, int):
            # A one-element list selector keeps the result a DataFrame while
            # honouring negative positions and raising IndexError when out of
            # range. The slice ``index : index + 1`` silently produced an
            # empty frame in both cases, which broke ``ds[-1]`` and made
            # ``for row in ds`` loop forever.
            return self._data.iloc[[index]]
        return self._data.iloc[index]

    def _format_item(self, item: pd.DataFrame) -> OrderedDict[str, Any]:
        """Convert selected rows into an ``OrderedDict`` keyed by ``columns``.

        A selection of exactly one row is collapsed to scalar values; any
        other selection keeps each column as returned by ``item[c]``.
        """
        if len(item) == 1:
            return OrderedDict([(c, item[c].iloc[0]) for c in self.columns])
        return OrderedDict([(c, item[c]) for c in self.columns])

    def __getitem__(self, index) -> OrderedDict[str, Any]:
        """Fetch the row(s) at ``index`` and format them as an ``OrderedDict``."""
        item = self._fetch_item(index)
        return self._format_item(item)
|
||
|
||
def ordered_dict_collator(
    batch: list[OrderedDict[str, Any]],
) -> OrderedDict[str, Any]:
    """
    Merge a list of per-sample OrderedDicts into one batch-level OrderedDict.

    Each key of the first sample maps to the list of that key's values across
    all samples, in batch order. A ``"batch_size"`` entry holding the number
    of samples is then set on the result.
    """
    collated: OrderedDict[str, Any] = OrderedDict()
    # The first sample defines which keys are collected and in what order.
    for field in batch[0]:
        collated[field] = [sample[field] for sample in batch]
    collated["batch_size"] = len(batch)
    return collated
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.