diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9fdd8d7..f861607 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: hooks: - id: flake8 args: - - --per-file-ignores=sdk/test/unit/conftest.py:E501 + - --per-file-ignores=sdk/test/unit/server/conftest.py:E501 # - repo: https://github.com/pre-commit/pre-commit-hooks # rev: v2.3.0 # hooks: diff --git a/data_models/workflow_templates/argo/argo_workflow_template_0.json b/data_models/workflow_templates/argo/argo_workflow_template_0.json index 6c526ea..0cdf672 100644 --- a/data_models/workflow_templates/argo/argo_workflow_template_0.json +++ b/data_models/workflow_templates/argo/argo_workflow_template_0.json @@ -1,163 +1 @@ -{ - "metadata": { - "name": "pipeline-test-artifact-pipeline-d5rzf", - "generate_name": "pipeline-test-artifact-pipeline-", - "namespace": "argo", - "uid": "310b62f6-95fb-418f-ab28-e7070b183979", - "resource_version": "9057", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "convert-to-artifact-0", - "template": "convert-to-artifact", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - } - ] - } - }, - { - "name": "show-artifact-0", - "template": "show-artifact", - "arguments": { - "artifacts": [ - { - "name": "a", - "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}" - } - ] - }, - "depends": "convert-to-artifact-0" - } - ] - } - }, - { - "name": "convert-to-artifact", - "inputs": { - "parameters": [ - { - "name": "a" - }, - { - "name": "a_art", - "default": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art" - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "show-artifact", - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "Param A" - } - ] - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-artifact-pipeline-jx7pb", "generate_name": "pipeline-test-artifact-pipeline-", "namespace": "argo", "uid": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92", "resource_version": "7515", "generation": 1, "creation_timestamp": "07/12/2024", "labels": {"workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}]}, "spec": {"templates": [{"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "Param A"}]}, "outputs": {"artifacts": [{"name": "b", "_from": "{{tasks.show-artifact-0.outputs.artifacts.b}}"}]}, "metadata": {}, "dag": {"tasks": [{"name": "convert-to-artifact-0", "template": "convert-to-artifact", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}]}}, {"name": "show-artifact-0", "template": "show-artifact", "arguments": {"artifacts": [{"name": "a", "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}"}]}, "depends": "convert-to-artifact-0"}]}}, {"name": "convert-to-artifact", "inputs": {"parameters": [{"name": "a"}, {"name": "a_art", "default": "null"}]}, "outputs": {"artifacts": [{"name": "a_art", "path": "a_art"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "show-artifact", "inputs": {"parameters": [{"name": "b", "default": "null"}], "artifacts": [{"name": "a", "path": "a"}]}, "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "a", "value": "Param A"}]}}} \ No newline at end of file diff --git a/data_models/workflow_templates/argo/argo_workflow_template_1.json b/data_models/workflow_templates/argo/argo_workflow_template_1.json index 3fe681c..c5d35d9 100644 --- a/data_models/workflow_templates/argo/argo_workflow_template_1.json +++ b/data_models/workflow_templates/argo/argo_workflow_template_1.json @@ -1,856 +1 @@ -{ - "metadata": { - "name": "pipeline-test-lightning-cpu-pipeline-c8drk", - "generate_name": "pipeline-test-lightning-cpu-pipeline-", - "namespace": "argo", - "uid": "3c2f201a-4764-4435-a71e-105f9a801897", - "resource_version": "13618", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-4", - "template": "lightning-ddp-4", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-5", - "template": "lightning-ddp-5", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-4", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-5", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "null" - } - ] - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-parameter-pipeline-c877j", "generate_name": "pipeline-test-parameter-pipeline-", "namespace": "argo", "uid": "d2715290-865d-4776-84c4-776632cd7159", "resource_version": "7640", "generation": 1, "creation_timestamp": "07/12/2024", "labels": {"workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}]}, "spec": {"templates": [{"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}"}}]}, "metadata": {}, "dag": {"tasks": [{"name": "a-plus-b-0", "template": "a-plus-b", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}, {"name": "b", "value": "{{inputs.parameters.b}}"}]}}, {"name": "a-plus-b-plus-2-0", "template": "a-plus-b-plus-2", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}"}, {"name": "b", "value": "2"}]}, "depends": "a-plus-b-0"}]}}, {"name": "a-plus-b", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "a-plus-b-plus-2", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}, {"name": "b", "value": "{{workflow.parameters.b}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]}}} \ No newline at end of file diff --git a/data_models/workflow_templates/argo/argo_workflow_template_2.json b/data_models/workflow_templates/argo/argo_workflow_template_2.json index 83c6a48..ef998da 100644 --- a/data_models/workflow_templates/argo/argo_workflow_template_2.json +++ b/data_models/workflow_templates/argo/argo_workflow_template_2.json @@ -1,656 +1 @@ -{ - "metadata": { - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2", - "generate_name": "pipeline-test-lightning-gpu-pipeline-", - "namespace": "argo", - "uid": "4e9795a0-2052-4a53-baa6-b8ab55724f5a", - "resource_version": "16215", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "null" - } - ] - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-torch-cpu-pipeline-hgcxv", "generate_name": "pipeline-test-torch-cpu-pipeline-", "namespace": "argo", "uid": "9de5c132-b8d2-44c8-b52e-47bfa710b7df", "resource_version": "7951", "generation": 1, "creation_timestamp": "07/12/2024", "labels": {"workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}]}, "spec": {"templates": [{"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}, {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}}} \ No newline at end of file diff --git a/data_models/workflow_templates/argo/argo_workflow_template_3.json b/data_models/workflow_templates/argo/argo_workflow_template_3.json index fa2823a..6c6f7fd 100644 --- a/data_models/workflow_templates/argo/argo_workflow_template_3.json +++ b/data_models/workflow_templates/argo/argo_workflow_template_3.json @@ -1,199 +1 @@ -{ - "metadata": { - "name": "pipeline-test-parameter-pipeline-mhwgd", - "generate_name": "pipeline-test-parameter-pipeline-", - "namespace": "argo", - "uid": "eb9cff7d-b949-4aa9-9cf6-703b2a602128", - "resource_version": "9922", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "a-plus-b-0", - "template": "a-plus-b", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - }, - { - "name": "b", - "value": "{{workflow.parameters.b}}" - } - ] - } - }, - { - "name": "a-plus-b-plus-2-0", - "template": "a-plus-b-plus-2", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" - }, - { - "name": "b", - "value": "2" - } - ] - }, - "depends": "a-plus-b-0" - } - ] - } - }, - { - "name": "a-plus-b", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "a-plus-b-plus-2", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "1" - }, - { - "name": "b", - "value": "2" - } - ] - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-torch-gpu-pipeline-7c4zp", "generate_name": "pipeline-test-torch-gpu-pipeline-", "namespace": "argo", "uid": "612226a1-b40f-4f68-92c3-ea8a5d6b3995", "resource_version": "9578", "generation": 1, "creation_timestamp": "07/12/2024", "labels": {"workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}]}, "spec": {"templates": [{"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}, {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}}} \ No newline at end of file diff --git a/data_models/workflow_templates/argo/argo_workflow_template_4.json b/data_models/workflow_templates/argo/argo_workflow_template_4.json deleted file mode 100644 index 9dd2a8b..0000000 --- a/data_models/workflow_templates/argo/argo_workflow_template_4.json +++ /dev/null @@ -1,908 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-torch-cpu-pipeline-2n6rx", - "generate_name": "pipeline-test-torch-cpu-pipeline-", - "namespace": "argo", - "uid": "b683dc24-a496-4b97-ad67-2702ea0167a5", - "resource_version": "10167", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-4", - "template": "torch-ddp-4", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-5", - "template": "torch-ddp-5", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-4", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-5", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "null" - }, - { - "name": "n_seconds_sleep", - "value": "null" - } - ] - } - } -} \ No newline at end of file diff --git a/data_models/workflow_templates/argo/argo_workflow_template_5.json b/data_models/workflow_templates/argo/argo_workflow_template_5.json deleted file mode 100644 index bded13d..0000000 --- a/data_models/workflow_templates/argo/argo_workflow_template_5.json +++ /dev/null @@ -1,692 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-torch-gpu-pipeline-dcfq8", - "generate_name": "pipeline-test-torch-gpu-pipeline-", - "namespace": "argo", - "uid": "1527e48c-6646-4cc4-8a54-edd274467a44", - "resource_version": "11645", - "generation": 1, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - } - ] - }, - "spec": { - "templates": [ - { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "null" - }, - { - "name": "n_seconds_sleep", - "value": "null" - } - ] - } - } -} \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_0.json b/data_models/workflow_templates/hera/hera_workflow_template_0.json index 26c7467..3ab50b0 100644 --- a/data_models/workflow_templates/hera/hera_workflow_template_0.json +++ b/data_models/workflow_templates/hera/hera_workflow_template_0.json @@ -1,13 +1,6 @@ { - "api_version": null, - "kind": null, "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, + "creation_timestamp": "07/12/2024", "generate_name": "pipeline-test-artifact-pipeline-", "generation": 1, "labels": { @@ -20,301 +13,107 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-artifact-pipeline-d5rzf", + "name": "pipeline-test-artifact-pipeline-jx7pb", "namespace": "argo", - "owner_references": null, - "resource_version": "9057", - "self_link": null, - "uid": "310b62f6-95fb-418f-ab28-e7070b183979" + "resource_version": "7515", + "uid": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "Param A", - "value_from": null + "value": "Param A" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, + "entrypoint": "bettmensch-ai-outer-dag", "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, "dag": { - "fail_fast": null, - "target": null, "tasks": [ { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null + "value": "{{inputs.parameters.a}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, "name": "convert-to-artifact-0", - "on_exit": null, - "template": "convert-to-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "template": "convert-to-artifact" }, { "arguments": { "artifacts": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": null, - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "name": "a" } - ], - "parameters": null + ] }, - "continue_on": null, - "dependencies": null, "depends": "convert-to-artifact-0", - "hooks": null, - "inline": null, "name": "show-artifact-0", - "on_exit": null, - "template": "show-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "template": "show-artifact" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null + "parameters": [ + { + "name": "a", + "value": "Param A" + } + ] }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "artifacts": [ + { + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b" + } + ] + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "name": "a" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": null, - "value_from": null + "name": "a_art" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, + "metadata": {}, "name": "convert-to-artifact", - "node_selector": null, "outputs": { "artifacts": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "path": "a_art" } - ], - "exit_code": null, - "parameters": null, - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -325,114 +124,45 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { "artifacts": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "path": "a" } ], - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null + "parameters": [ + { + "default": "null", + "name": "b" + } + ] }, - "metrics": null, + "metadata": {}, "name": "show-artifact", - "node_selector": null, "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "artifacts": [ + { + "name": "b", + "path": "b" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -443,35 +173,31 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null + ] } } \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_1.json b/data_models/workflow_templates/hera/hera_workflow_template_1.json index 57e161f..d052fd2 100644 --- a/data_models/workflow_templates/hera/hera_workflow_template_1.json +++ b/data_models/workflow_templates/hera/hera_workflow_template_1.json @@ -1,14 +1,7 @@ { - "api_version": null, - "kind": null, "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-lightning-cpu-pipeline-", + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-parameter-pipeline-", "generation": 1, "labels": { "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" @@ -20,1747 +13,185 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-lightning-cpu-pipeline-c8drk", + "name": "pipeline-test-parameter-pipeline-c877j", "namespace": "argo", - "owner_references": null, - "resource_version": "13618", - "self_link": null, - "uid": "3c2f201a-4764-4435-a71e-105f9a801897" + "resource_version": "7640", + "uid": "d2715290-865d-4776-84c4-776632cd7159" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "null", - "value_from": null + "name": "a", + "value": "1" + }, + { + "name": "b", + "value": "2" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, + "entrypoint": "bettmensch-ai-outer-dag", "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, "dag": { - "fail_fast": null, - "target": null, "tasks": [ { "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-4", - "on_exit": null, - "template": "lightning-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ + "name": "a", + "value": "{{inputs.parameters.a}}" + }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null + "name": "b", + "value": "{{inputs.parameters.b}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-5", - "on_exit": null, - "template": "lightning-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "name": "a-plus-b-0", + "template": "a-plus-b" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" + }, + { + "name": "b", + "value": "2" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "name": "a", + "value": "1" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "b", + "value": "2" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "1", + "name": "a" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "sum" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, + "metadata": {}, + "name": "a-plus-b", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { - "cpu": "700m", - "memory": "1Gi" + "cpu": "100m", + "memory": "100Mi" }, "requests": { - "cpu": "700m", - "memory": "1Gi" + "cpu": "100m", + "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "metrics": null, - "name": "lightning-ddp-4", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null + "default": "1", + "name": "a" }, { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "sum" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "metrics": null, - "name": "lightning-ddp-5", - "node_selector": null, + "metadata": {}, + "name": "a-plus-b-plus-2", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -1771,35 +202,35 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null + ] } } \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_2.json b/data_models/workflow_templates/hera/hera_workflow_template_2.json index 150d8b0..9e57fca 100644 --- a/data_models/workflow_templates/hera/hera_workflow_template_2.json +++ b/data_models/workflow_templates/hera/hera_workflow_template_2.json @@ -1,14 +1,7 @@ { - "api_version": null, - "kind": null, "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-lightning-gpu-pipeline-", + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-torch-cpu-pipeline-", "generation": 1, "labels": { "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" @@ -20,1337 +13,377 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv", "namespace": "argo", - "owner_references": null, - "resource_version": "16215", - "self_link": null, - "uid": "4e9795a0-2052-4a53-baa6-b8ab55724f5a" + "resource_version": "7951", + "uid": "9de5c132-b8d2-44c8-b52e-47bfa710b7df" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "null", - "value_from": null + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, + "entrypoint": "bettmensch-ai-outer-dag", "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, "resource": { "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, "resource": { "action": "delete", - "failure_condition": null, "flags": [ "service", "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + ] + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, "dag": { - "fail_fast": null, - "target": null, "tasks": [ { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" }, { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, + "depends": "torch-ddp-0", "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "template": "show-duration-param" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" + }, + { + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", + "torch-job": "torch-ddp-0", "torch-node": "0" } }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "duration", - "value": null, "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], "env": [ { "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null + "value": "INFO" }, { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" }, { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" }, { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" }, { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" }, { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" } ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", "ports": [ { "container_port": 29200, - "host_ip": null, - "host_port": null, "name": "ddp", "protocol": "TCP" } ], - "readiness_probe": null, "resources": { "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "300Mi" }, "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "300Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "default": "100", + "name": "n_iter" }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" + "torch-job": "torch-ddp-0", + "torch-node": "1" } }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "duration", - "value": null, "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "duration" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], "env": [ { "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null + "value": "INFO" }, { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" }, { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" }, { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" }, { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" }, { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" }, { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" } ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "300Mi" }, "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "300Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "name": "a" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, + "metadata": {}, "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "outputs": {}, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -1361,35 +394,35 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null + ] } } \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_3.json b/data_models/workflow_templates/hera/hera_workflow_template_3.json index d00dd49..7a752b6 100644 --- a/data_models/workflow_templates/hera/hera_workflow_template_3.json +++ b/data_models/workflow_templates/hera/hera_workflow_template_3.json @@ -1,14 +1,7 @@ { - "api_version": null, - "kind": null, "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-parameter-pipeline-", + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-torch-gpu-pipeline-", "generation": 1, "labels": { "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" @@ -20,458 +13,395 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-parameter-pipeline-mhwgd", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp", "namespace": "argo", - "owner_references": null, - "resource_version": "9922", - "self_link": null, - "uid": "eb9cff7d-b949-4aa9-9cf6-703b2a602128" + "resource_version": "9578", + "uid": "612226a1-b40f-4f68-92c3-ea8a5d6b3995" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "1", - "value_from": null + "name": "n_iter" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null + "name": "n_seconds_sleep" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, + "entrypoint": "bettmensch-ai-outer-dag", "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } + }, + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo" + ] + } + }, + { "dag": { - "fail_fast": null, - "target": null, "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" + }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "a-plus-b-0", - "on_exit": null, - "template": "a-plus-b", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" + } + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "a-plus-b-0", - "hooks": null, - "inline": null, - "name": "a-plus-b-plus-2-0", - "on_exit": null, - "template": "a-plus-b-plus-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0" + } }, - "metrics": null, - "name": "a-plus-b", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, + "name": "duration", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP" + } + ], "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1" + } }, - "metrics": null, - "name": "a-plus-b-plus-2", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, + "name": "duration", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" + }, + "script": { + "command": [ + "python" ], - "result": null + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] + }, + { + "inputs": { + "parameters": [ + { + "name": "a" + } + ] + }, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -482,35 +412,35 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null + ] } } \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_4.json b/data_models/workflow_templates/hera/hera_workflow_template_4.json deleted file mode 100644 index a5d9664..0000000 --- a/data_models/workflow_templates/hera/hera_workflow_template_4.json +++ /dev/null @@ -1,1922 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-torch-cpu-pipeline-", - "generation": 1, - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-torch-cpu-pipeline-2n6rx", - "namespace": "argo", - "owner_references": null, - "resource_version": "10167", - "self_link": null, - "uid": "b683dc24-a496-4b97-ad67-2702ea0167a5" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "null", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "null", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-4", - "on_exit": null, - "template": "torch-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-5", - "on_exit": null, - "template": "torch-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "metrics": null, - "name": "torch-ddp-4", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "metrics": null, - "name": "torch-ddp-5", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null - } -} \ No newline at end of file diff --git a/data_models/workflow_templates/hera/hera_workflow_template_5.json b/data_models/workflow_templates/hera/hera_workflow_template_5.json deleted file mode 100644 index 35ff48f..0000000 --- a/data_models/workflow_templates/hera/hera_workflow_template_5.json +++ /dev/null @@ -1,1476 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": null, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-torch-gpu-pipeline-", - "generation": 1, - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-torch-gpu-pipeline-dcfq8", - "namespace": "argo", - "owner_references": null, - "resource_version": "11645", - "self_link": null, - "uid": "1527e48c-6646-4cc4-8a54-edd274467a44" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "null", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "null", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": null - } -} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_0.json b/data_models/workflows/argo/argo_workflow_0.json index 822a6d6..acbac65 100644 --- a/data_models/workflows/argo/argo_workflow_0.json +++ b/data_models/workflows/argo/argo_workflow_0.json @@ -1,670 +1 @@ -{ - "metadata": { - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "generate_name": "pipeline-test-parameter-pipeline-mhwgd-flow-", - "namespace": "argo", - "uid": "ddfe31ae-1231-4a2d-be6c-4b712bcc15a6", - "resource_version": "18503", - "generation": 6, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "a", - "value": "-10" - }, - { - "name": "b", - "value": "20" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "nodes": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "type": "DAG", - "display_name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 47 - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729" - ], - "outbound_nodes": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" - ] - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0(0)", - "type": "Pod", - "display_name": "a-plus-b-plus-2-0(0)", - "template_name": "a-plus-b-plus-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "10" - }, - { - "name": "b", - "default": "2", - "value": "2" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "12", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0", - "type": "Retry", - "display_name": "a-plus-b-plus-2-0", - "template_name": "a-plus-b-plus-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "10" - }, - { - "name": "b", - "default": "2", - "value": "2" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "12", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" - ] - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0(0)", - "type": "Pod", - "display_name": "a-plus-b-0(0)", - "template_name": "a-plus-b", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "-10" - }, - { - "name": "b", - "default": "2", - "value": "20" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "10", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0", - "type": "Retry", - "display_name": "a-plus-b-0", - "template_name": "a-plus-b", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 47 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "-10" - }, - { - "name": "b", - "default": "2", - "value": "20" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "10", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680" - ] - } - }, - "stored_templates": { - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b": { - "name": "a-plus-b", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b-plus-2": { - "name": "a-plus-b-plus-2", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "a-plus-b-0", - "template": "a-plus-b", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - }, - { - "name": "b", - "value": "{{workflow.parameters.b}}" - } - ] - } - }, - { - "name": "a-plus-b-plus-2-0", - "template": "a-plus-b-plus-2", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" - }, - { - "name": "b", - "value": "2" - } - ] - }, - "depends": "a-plus-b-0" - } - ] - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 2, - "memory": 47 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "a-plus-b-0", - "template": "a-plus-b", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - }, - { - "name": "b", - "value": "{{workflow.parameters.b}}" - } - ] - } - }, - { - "name": "a-plus-b-plus-2-0", - "template": "a-plus-b-plus-2", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" - }, - { - "name": "b", - "value": "2" - } - ] - }, - "depends": "a-plus-b-0" - } - ] - } - }, - { - "name": "a-plus-b", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "a-plus-b-plus-2", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "-10" - }, - { - "name": "b", - "value": "20" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": true, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": true - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "generate_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-", "namespace": "argo", "uid": "ae69b1e3-a235-44d5-8667-bef63fc15821", "resource_version": "11463", "generation": 13, "creation_timestamp": "07/12/2024", "labels": {"bettmensch.ai/pipeline-id": "612226a1-b40f-4f68-92c3-ea8a5d6b3995", "bettmensch.ai/pipeline-name": "pipeline-test-torch-gpu-pipeline-7c4zp", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded"}, "annotations": {"karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:bettmensch.ai/pipeline-id": {}, "f:bettmensch.ai/pipeline-name": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}, {"manager": "workflow-controller", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:annotations": {".": {}, "f:karpenter.sh/do-not-disrupt": {}, "f:workflows.argoproj.io/pod-name-format": {}}, "f:labels": {"f:workflows.argoproj.io/completed": {}, "f:workflows.argoproj.io/phase": {}}}, "f:status": {}}}]}, "spec": {"arguments": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "workflow_template_ref": {"name": "pipeline-test-torch-gpu-pipeline-7c4zp"}}, "status": {"phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "nodes": {"pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "type": "DAG", "display_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "resources_duration": {"cpu": 23, "memory": 1644, "nvidia.com/gpu": 190}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060"], "outbound_nodes": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231"]}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-delete-torch-ddp-service", "type": "Pod", "display_name": "torch-ddp-delete-torch-ddp-service", "template_name": "torch-ddp-delete-torch-ddp-service", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 0, "memory": 0}, "outputs": {"exit_code": "0"}, "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal"}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0(0)", "type": "Pod", "display_name": "torch-ddp-0(0)", "template_name": "torch-ddp-0", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 11, "memory": 839, "nvidia.com/gpu": 99}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231"], "host_node_name": "ip-10-0-50-210.us-east-2.compute.internal"}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-create-torch-ddp-service", "type": "Pod", "display_name": "torch-ddp-create-torch-ddp-service", "template_name": "torch-ddp-create-torch-ddp-service", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 0, "memory": 1}, "outputs": {"exit_code": "0"}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059"], "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0", "type": "Retry", "display_name": "show-duration-param-0", "template_name": "show-duration-param", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 27}, "inputs": {"parameters": [{"name": "a", "value": "30"}]}, "outputs": {"exit_code": "0"}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694"]}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1", "type": "Retry", "display_name": "torch-ddp-0-worker-1", "template_name": "torch-ddp-1", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 11, "memory": 777, "nvidia.com/gpu": 91}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430"]}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag", "type": "DAG", "display_name": "bettmensch-ai-inner-dag", "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "resources_duration": {"cpu": 23, "memory": 1644, "nvidia.com/gpu": 190}, "inputs": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252"], "outbound_nodes": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231"]}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1(0)", "type": "Pod", "display_name": "torch-ddp-0-worker-1(0)", "template_name": "torch-ddp-1", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 11, "memory": 777, "nvidia.com/gpu": 91}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "host_node_name": "ip-10-0-50-218.us-east-2.compute.internal"}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0", "type": "Retry", "display_name": "torch-ddp-0", "template_name": "torch-ddp-0", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "3/3", "resources_duration": {"cpu": 12, "memory": 866, "nvidia.com/gpu": 99}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387"]}, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": {"id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0(0)", "type": "Pod", "display_name": "show-duration-param-0(0)", "template_name": "show-duration-param", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 27}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "a", "value": "30"}]}, "outputs": {"exit_code": "0"}, "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}}, "stored_templates": {"namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-inner-dag": {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-outer-dag": {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/show-duration-param": {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-0": {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-1": {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-create-torch-ddp-service": {"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-delete-torch-ddp-service": {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}}, "conditions": [{"type": "PodRunning", "status": "False"}, {"type": "Completed", "status": "True"}], "resources_duration": {"cpu": 23, "memory": 1644, "nvidia.com/gpu": 190}, "stored_workflow_template_spec": {"templates": [{"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}, {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}, "requests": {"cpu": "100m", "memory": "700Mi", "nvidia.com/gpu": "1"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}], "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "service_account_name": "argo-workflow", "workflow_template_ref": {"name": "pipeline-test-torch-gpu-pipeline-7c4zp"}}, "artifact_repository_ref": {"config_map": "artifact-repositories", "key": "bettmensch-ai-artifact-repository", "namespace": "argo", "artifact_repository": {"s3": {"endpoint": "s3.us-east-2.amazonaws.com", "bucket": "bettmensch-ai-artifact-repository", "insecure": true, "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}"}}}, "artifact_gc_status": {"not_specified": true}, "task_results_completion_status": {"pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": true, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": true, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": true, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": true, "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": true}}} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_1.json b/data_models/workflows/argo/argo_workflow_1.json index 9b56dcf..0ea39fe 100644 --- a/data_models/workflows/argo/argo_workflow_1.json +++ b/data_models/workflows/argo/argo_workflow_1.json @@ -1,552 +1 @@ -{ - "metadata": { - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "generate_name": "pipeline-test-artifact-pipeline-d5rzf-flow-", - "namespace": "argo", - "uid": "dc477fa6-dd12-43b7-8511-e3dc03bf023c", - "resource_version": "18180", - "generation": 6, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "a", - "value": "Second integration test value a" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "nodes": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "type": "DAG", - "display_name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 48 - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393" - ], - "outbound_nodes": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" - ] - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0", - "type": "Retry", - "display_name": "show-artifact-0", - "template_name": "show-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz" - } - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" - ] - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0(0)", - "type": "Pod", - "display_name": "convert-to-artifact-0(0)", - "template_name": "convert-to-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "Second integration test value a" - }, - { - "name": "a_art", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0(0)", - "type": "Pod", - "display_name": "show-artifact-0(0)", - "template_name": "show-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz" - } - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0", - "type": "Retry", - "display_name": "convert-to-artifact-0", - "template_name": "convert-to-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 48 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "Second integration test value a" - }, - { - "name": "a_art", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056" - ] - } - }, - "stored_templates": { - "namespaced/pipeline-test-artifact-pipeline-d5rzf/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "convert-to-artifact-0", - "template": "convert-to-artifact", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - } - ] - } - }, - { - "name": "show-artifact-0", - "template": "show-artifact", - "arguments": { - "artifacts": [ - { - "name": "a", - "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}" - } - ] - }, - "depends": "convert-to-artifact-0" - } - ] - } - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/convert-to-artifact": { - "name": "convert-to-artifact", - "inputs": { - "parameters": [ - { - "name": "a" - }, - { - "name": "a_art", - "default": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art" - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/show-artifact": { - "name": "show-artifact", - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 2, - "memory": 48 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "convert-to-artifact-0", - "template": "convert-to-artifact", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - } - ] - } - }, - { - "name": "show-artifact-0", - "template": "show-artifact", - "arguments": { - "artifacts": [ - { - "name": "a", - "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}" - } - ] - }, - "depends": "convert-to-artifact-0" - } - ] - } - }, - { - "name": "convert-to-artifact", - "inputs": { - "parameters": [ - { - "name": "a" - }, - { - "name": "a_art", - "default": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art" - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "show-artifact", - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "Second integration test value a" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": true, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": true - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "generate_name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-", "namespace": "argo", "uid": "15d5987d-9e1c-4606-82a0-611e6c7b19ee", "resource_version": "9564", "generation": 12, "creation_timestamp": "07/12/2024", "labels": {"bettmensch.ai/pipeline-id": "9de5c132-b8d2-44c8-b52e-47bfa710b7df", "bettmensch.ai/pipeline-name": "pipeline-test-torch-cpu-pipeline-hgcxv", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded"}, "annotations": {"karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:bettmensch.ai/pipeline-id": {}, "f:bettmensch.ai/pipeline-name": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}, {"manager": "workflow-controller", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:annotations": {".": {}, "f:karpenter.sh/do-not-disrupt": {}, "f:workflows.argoproj.io/pod-name-format": {}}, "f:labels": {"f:workflows.argoproj.io/completed": {}, "f:workflows.argoproj.io/phase": {}}}, "f:status": {}}}]}, "spec": {"arguments": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "workflow_template_ref": {"name": "pipeline-test-torch-cpu-pipeline-hgcxv"}}, "status": {"phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "nodes": {"pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "type": "DAG", "display_name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "resources_duration": {"cpu": 26, "memory": 1054}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078"], "outbound_nodes": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825"]}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0-worker-1(0)", "type": "Pod", "display_name": "torch-ddp-0-worker-1(0)", "template_name": "torch-ddp-1", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 12, "memory": 496}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal"}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-delete-torch-ddp-service", "type": "Pod", "display_name": "torch-ddp-delete-torch-ddp-service", "template_name": "torch-ddp-delete-torch-ddp-service", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 0, "memory": 0}, "outputs": {"exit_code": "0"}, "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-create-torch-ddp-service", "type": "Pod", "display_name": "torch-ddp-create-torch-ddp-service", "template_name": "torch-ddp-create-torch-ddp-service", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 0, "memory": 1}, "outputs": {"exit_code": "0"}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069"], "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0", "type": "Retry", "display_name": "torch-ddp-0", "template_name": "torch-ddp-0", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "3/3", "resources_duration": {"cpu": 14, "memory": 557}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249"]}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0(0)", "type": "Pod", "display_name": "torch-ddp-0(0)", "template_name": "torch-ddp-0", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 13, "memory": 527}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825"], "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag", "type": "DAG", "display_name": "bettmensch-ai-inner-dag", "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "5/5", "resources_duration": {"cpu": 26, "memory": 1054}, "inputs": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150"], "outbound_nodes": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825"]}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0-worker-1", "type": "Retry", "display_name": "torch-ddp-0-worker-1", "template_name": "torch-ddp-1", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 12, "memory": 496}, "inputs": {"parameters": [{"name": "n_iter", "default": "100", "value": "15"}, {"name": "n_seconds_sleep", "default": "10", "value": "2"}, {"name": "duration", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value": "30", "value_from": {"path": "duration"}}], "exit_code": "0"}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172"]}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.show-duration-param-0", "type": "Retry", "display_name": "show-duration-param-0", "template_name": "show-duration-param", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 30}, "inputs": {"parameters": [{"name": "a", "value": "30"}]}, "outputs": {"exit_code": "0"}, "children": ["pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508"]}, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508": {"id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.show-duration-param-0(0)", "type": "Pod", "display_name": "show-duration-param-0(0)", "template_name": "show-duration-param", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 30}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "a", "value": "30"}]}, "outputs": {"exit_code": "0"}, "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal"}}, "stored_templates": {"namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/bettmensch-ai-inner-dag": {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/bettmensch-ai-outer-dag": {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/show-duration-param": {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-0": {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-1": {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-create-torch-ddp-service": {"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-delete-torch-ddp-service": {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}}, "conditions": [{"type": "PodRunning", "status": "False"}, {"type": "Completed", "status": "True"}], "resources_duration": {"cpu": 26, "memory": 1054}, "stored_workflow_template_spec": {"templates": [{"name": "torch-ddp-create-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "create", "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n"}}, {"name": "torch-ddp-delete-torch-ddp-service", "inputs": {}, "outputs": {}, "metadata": {}, "resource": {"action": "delete", "flags": ["service", "--selector", "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", "-n", "argo"]}}, {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}]}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "torch-ddp-create-torch-ddp-service", "template": "torch-ddp-create-torch-ddp-service", "arguments": {}}, {"name": "torch-ddp-0", "template": "torch-ddp-0", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-0-worker-1", "template": "torch-ddp-1", "arguments": {"parameters": [{"name": "n_iter", "value": "{{inputs.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{inputs.parameters.n_seconds_sleep}}"}]}, "depends": "torch-ddp-create-torch-ddp-service"}, {"name": "torch-ddp-delete-torch-ddp-service", "template": "torch-ddp-delete-torch-ddp-service", "arguments": {}, "depends": "torch-ddp-0"}, {"name": "show-duration-param-0", "template": "show-duration-param", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}"}]}, "depends": "torch-ddp-0"}]}}, {"name": "torch-ddp-0", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "0"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "ports": [{"container_port": 29200, "name": "ddp", "protocol": "TCP"}], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "0"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "torch-ddp-1", "inputs": {"parameters": [{"name": "n_iter", "default": "100"}, {"name": "n_seconds_sleep", "default": "10"}, {"name": "duration", "default": "null"}]}, "outputs": {"parameters": [{"name": "duration", "value_from": {"path": "duration"}}]}, "metadata": {"labels": {"torch-job": "torch-ddp-0", "torch-node": "1"}}, "script": {"image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", "name": "", "command": ["python"], "env": [{"name": "NCCL_DEBUG", "value": "INFO"}, {"name": "bettmensch_ai_torch_ddp_min_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_max_nodes", "value": "2"}, {"name": "bettmensch_ai_torch_ddp_node_rank", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_nproc_per_node", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_max_restarts", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_start_method", "value": "fork"}, {"name": "bettmensch_ai_torch_ddp_rdzv_backend", "value": "static"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local"}, {"name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", "value": "29200"}, {"name": "bettmensch_ai_torch_ddp_run_id", "value": "1"}, {"name": "bettmensch_ai_torch_ddp_tee", "value": "0"}], "resources": {"limits": {"cpu": "100m", "memory": "300Mi"}, "requests": {"cpu": "100m", "memory": "300Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}, "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}"}, {"name": "show-duration-param", "inputs": {"parameters": [{"name": "a"}]}, "outputs": {}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "{{workflow.parameters.n_iter}}"}, {"name": "n_seconds_sleep", "value": "{{workflow.parameters.n_seconds_sleep}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "n_iter", "value": "15"}, {"name": "n_seconds_sleep", "value": "2"}]}, "service_account_name": "argo-workflow", "workflow_template_ref": {"name": "pipeline-test-torch-cpu-pipeline-hgcxv"}}, "artifact_repository_ref": {"config_map": "artifact-repositories", "key": "bettmensch-ai-artifact-repository", "namespace": "argo", "artifact_repository": {"s3": {"endpoint": "s3.us-east-2.amazonaws.com", "bucket": "bettmensch-ai-artifact-repository", "insecure": true, "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}"}}}, "artifact_gc_status": {"not_specified": true}, "task_results_completion_status": {"pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172": true, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825": true, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150": true, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249": true, "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508": true}}} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_2.json b/data_models/workflows/argo/argo_workflow_2.json index 72ffa9d..9bfb13f 100644 --- a/data_models/workflows/argo/argo_workflow_2.json +++ b/data_models/workflows/argo/argo_workflow_2.json @@ -1,1881 +1 @@ -{ - "metadata": { - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "generate_name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-", - "namespace": "argo", - "uid": "d48f4d8d-61b1-4b86-a200-49c525c6f516", - "resource_version": "17861", - "generation": 12, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "00:00:00:20" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "7/7", - "nodes": { - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "type": "DAG", - "display_name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "7/7", - "resources_duration": { - "cpu": 128, - "memory": 2228, - "nvidia.com/gpu": 179 - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919" - ], - "outbound_nodes": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-1(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-1(0)", - "template_name": "lightning-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 34, - "memory": 587, - "nvidia.com/gpu": 48 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:23.332028", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-49-51.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0(0)", - "type": "Pod", - "display_name": "lightning-ddp-0(0)", - "template_name": "lightning-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 26, - "memory": 467, - "nvidia.com/gpu": 37 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:23.295598", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645" - ], - "host_node_name": "ip-10-0-49-145.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.show-duration-param-0(0)", - "type": "Pod", - "display_name": "show-duration-param-0(0)", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "0:00:23.295598" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0", - "type": "Retry", - "display_name": "lightning-ddp-0", - "template_name": "lightning-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "3/3", - "resources_duration": { - "cpu": 27, - "memory": 491, - "nvidia.com/gpu": 37 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:23.295598", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-delete-torch-service", - "type": "Pod", - "display_name": "lightning-ddp-delete-torch-service", - "template_name": "lightning-ddp-delete-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.show-duration-param-0", - "type": "Retry", - "display_name": "show-duration-param-0", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "0:00:23.295598" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-3(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-3(0)", - "template_name": "lightning-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 606, - "nvidia.com/gpu": 50 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:22.990339", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-29.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-2(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-2(0)", - "template_name": "lightning-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 31, - "memory": 544, - "nvidia.com/gpu": 44 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:22.838134", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-166.us-east-2.compute.internal" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-2", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-2", - "template_name": "lightning-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 31, - "memory": 544, - "nvidia.com/gpu": 44 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:22.838134", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-3", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-3", - "template_name": "lightning-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 606, - "nvidia.com/gpu": 50 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:22.990339", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-1", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-1", - "template_name": "lightning-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 34, - "memory": 587, - "nvidia.com/gpu": 48 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:23.332028", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660" - ] - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919": { - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919", - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-create-torch-service", - "type": "Pod", - "display_name": "lightning-ddp-create-torch-service", - "template_name": "lightning-ddp-create-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - } - }, - "stored_templates": { - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-0": { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-1": { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-2": { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-3": { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-create-torch-service": { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-delete-torch-service": { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "-n", - "argo" - ] - } - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/show-duration-param": { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 128, - "memory": 2228, - "nvidia.com/gpu": 179 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "00:00:00:20" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919": true - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "generate_name": "pipeline-test-parameter-pipeline-c877j-flow-", "namespace": "argo", "uid": "f4623367-e5c2-4ba7-9a7a-633c55314421", "resource_version": "8018", "generation": 7, "creation_timestamp": "07/12/2024", "labels": {"bettmensch.ai/pipeline-id": "d2715290-865d-4776-84c4-776632cd7159", "bettmensch.ai/pipeline-name": "pipeline-test-parameter-pipeline-c877j", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded"}, "annotations": {"karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:bettmensch.ai/pipeline-id": {}, "f:bettmensch.ai/pipeline-name": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}, {"manager": "workflow-controller", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:annotations": {".": {}, "f:karpenter.sh/do-not-disrupt": {}, "f:workflows.argoproj.io/pod-name-format": {}}, "f:labels": {"f:workflows.argoproj.io/completed": {}, "f:workflows.argoproj.io/phase": {}}}, "f:status": {}}}]}, "spec": {"arguments": {"parameters": [{"name": "a", "value": "-100"}, {"name": "b", "value": "100"}]}, "workflow_template_ref": {"name": "pipeline-test-parameter-pipeline-c877j"}}, "status": {"phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "nodes": {"pipeline-test-parameter-pipeline-c877j-flow-tfgmn": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "type": "DAG", "display_name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 54}, "children": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891"], "outbound_nodes": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770"]}, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag", "type": "DAG", "display_name": "bettmensch-ai-inner-dag", "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 54}, "inputs": {"parameters": [{"name": "a", "value": "-100"}, {"name": "b", "value": "100"}]}, "outputs": {"parameters": [{"name": "sum", "value": "2"}]}, "children": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323"], "outbound_nodes": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770"]}, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0(0)", "type": "Pod", "display_name": "a-plus-b-0(0)", "template_name": "a-plus-b", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 28}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "a", "default": "1", "value": "-100"}, {"name": "b", "default": "2", "value": "100"}, {"name": "sum", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value": "0", "value_from": {"path": "sum"}}], "exit_code": "0"}, "children": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911"], "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0", "type": "Retry", "display_name": "a-plus-b-plus-2-0", "template_name": "a-plus-b-plus-2", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 26}, "inputs": {"parameters": [{"name": "a", "default": "1", "value": "0"}, {"name": "b", "default": "2", "value": "2"}, {"name": "sum", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value": "2", "value_from": {"path": "sum"}}], "exit_code": "0"}, "children": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770"]}, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0", "type": "Retry", "display_name": "a-plus-b-0", "template_name": "a-plus-b", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 54}, "inputs": {"parameters": [{"name": "a", "default": "1", "value": "-100"}, {"name": "b", "default": "2", "value": "100"}, {"name": "sum", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value": "0", "value_from": {"path": "sum"}}], "exit_code": "0"}, "children": ["pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278"]}, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": {"id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770", "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0(0)", "type": "Pod", "display_name": "a-plus-b-plus-2-0(0)", "template_name": "a-plus-b-plus-2", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 26}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "a", "default": "1", "value": "0"}, {"name": "b", "default": "2", "value": "2"}, {"name": "sum", "default": "null", "value": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value": "2", "value_from": {"path": "sum"}}], "exit_code": "0"}, "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal"}}, "stored_templates": {"namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b": {"name": "a-plus-b", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, "namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b-plus-2": {"name": "a-plus-b-plus-2", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-inner-dag": {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}"}}]}, "metadata": {}, "dag": {"tasks": [{"name": "a-plus-b-0", "template": "a-plus-b", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}, {"name": "b", "value": "{{inputs.parameters.b}}"}]}}, {"name": "a-plus-b-plus-2-0", "template": "a-plus-b-plus-2", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}"}, {"name": "b", "value": "2"}]}, "depends": "a-plus-b-0"}]}}, "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-outer-dag": {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}, {"name": "b", "value": "{{workflow.parameters.b}}"}]}}]}}}, "conditions": [{"type": "PodRunning", "status": "False"}, {"type": "Completed", "status": "True"}], "resources_duration": {"cpu": 2, "memory": 54}, "stored_workflow_template_spec": {"templates": [{"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}"}}]}, "metadata": {}, "dag": {"tasks": [{"name": "a-plus-b-0", "template": "a-plus-b", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}, {"name": "b", "value": "{{inputs.parameters.b}}"}]}}, {"name": "a-plus-b-plus-2-0", "template": "a-plus-b-plus-2", "arguments": {"parameters": [{"name": "a", "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}"}, {"name": "b", "value": "2"}]}, "depends": "a-plus-b-0"}]}}, {"name": "a-plus-b", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "a-plus-b-plus-2", "inputs": {"parameters": [{"name": "a", "default": "1"}, {"name": "b", "default": "2"}, {"name": "sum", "default": "null"}]}, "outputs": {"parameters": [{"name": "sum", "value_from": {"path": "sum"}}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}, {"name": "b", "value": "{{workflow.parameters.b}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "a", "value": "-100"}, {"name": "b", "value": "100"}]}, "service_account_name": "argo-workflow", "workflow_template_ref": {"name": "pipeline-test-parameter-pipeline-c877j"}}, "artifact_repository_ref": {"config_map": "artifact-repositories", "key": "bettmensch-ai-artifact-repository", "namespace": "argo", "artifact_repository": {"s3": {"endpoint": "s3.us-east-2.amazonaws.com", "bucket": "bettmensch-ai-artifact-repository", "insecure": true, "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}"}}}, "artifact_gc_status": {"not_specified": true}, "task_results_completion_status": {"pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": true, "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": true}}} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_3.json b/data_models/workflows/argo/argo_workflow_3.json index 5642533..5150a80 100644 --- a/data_models/workflows/argo/argo_workflow_3.json +++ b/data_models/workflows/argo/argo_workflow_3.json @@ -1,2491 +1 @@ -{ - "metadata": { - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "generate_name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-", - "namespace": "argo", - "uid": "bfe2cd60-7fa7-48ba-96f1-0845dbc142a8", - "resource_version": "16194", - "generation": 18, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "00:00:00:20" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-lightning-cpu-pipeline-c8drk" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "9/10", - "nodes": { - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "type": "DAG", - "display_name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "9/10", - "resources_duration": { - "cpu": 235, - "memory": 4168 - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449" - ], - "outbound_nodes": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-2(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-2(0)", - "template_name": "lightning-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 898 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.986543", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-3", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-3", - "template_name": "lightning-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 39, - "memory": 684 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.132619", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-1(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-1(0)", - "template_name": "lightning-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 899 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.512020", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-4(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-4(0)", - "template_name": "lightning-ddp-4", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 633 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.044815", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-203.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-5(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-5(0)", - "template_name": "lightning-ddp-5", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 28, - "memory": 519 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.170848", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-142.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0(1)", - "type": "Pod", - "display_name": "show-duration-param-0(1)", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 25 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "0:00:20.968705" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-delete-torch-service", - "type": "Pod", - "display_name": "lightning-ddp-delete-torch-service", - "template_name": "lightning-ddp-delete-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-142.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-create-torch-service", - "type": "Pod", - "display_name": "lightning-ddp-create-torch-service", - "template_name": "lightning-ddp-create-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0", - "type": "Retry", - "display_name": "lightning-ddp-0", - "template_name": "lightning-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "3/4", - "resources_duration": { - "cpu": 28, - "memory": 535 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.968705", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-4", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-4", - "template_name": "lightning-ddp-4", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 633 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.044815", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0(0)", - "type": "Pod", - "display_name": "lightning-ddp-0(0)", - "template_name": "lightning-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 27, - "memory": 510 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.968705", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499" - ], - "host_node_name": "ip-10-0-50-149.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-5", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-5", - "template_name": "lightning-ddp-5", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 28, - "memory": 519 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.170848", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-1", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-1", - "template_name": "lightning-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 899 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.512020", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-2", - "type": "Retry", - "display_name": "lightning-ddp-0-worker-2", - "template_name": "lightning-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 898 - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:20.986543", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-3(0)", - "type": "Pod", - "display_name": "lightning-ddp-0-worker-3(0)", - "template_name": "lightning-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 39, - "memory": 684 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30", - "value": "00:00:00:20" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "0:00:21.132619", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-49-32.us-east-2.compute.internal" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0", - "type": "Retry", - "display_name": "show-duration-param-0", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/2", - "resources_duration": { - "cpu": 1, - "memory": 25 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "0:00:20.968705" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979" - ] - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678": { - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0(0)", - "type": "Pod", - "display_name": "show-duration-param-0(0)", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Error", - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "message": "pod deleted", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "0/1", - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "0:00:20.968705" - } - ] - }, - "host_node_name": "ip-10-0-50-149.us-east-2.compute.internal" - } - }, - "stored_templates": { - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-4", - "template": "lightning-ddp-4", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-5", - "template": "lightning-ddp-5", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-0": { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-1": { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-2": { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-3": { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-4": { - "name": "lightning-ddp-4", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-5": { - "name": "lightning-ddp-5", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-create-torch-service": { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-delete-torch-service": { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ] - } - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/show-duration-param": { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 235, - "memory": 4168 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "lightning-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "lightning-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "lightning-ddp-create-torch-service", - "template": "lightning-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "lightning-ddp-0", - "template": "lightning-ddp-0", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-1", - "template": "lightning-ddp-1", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-2", - "template": "lightning-ddp-2", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-3", - "template": "lightning-ddp-3", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-4", - "template": "lightning-ddp-4", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-0-worker-5", - "template": "lightning-ddp-5", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "{{workflow.parameters.max_time}}" - } - ] - }, - "depends": "lightning-ddp-create-torch-service" - }, - { - "name": "lightning-ddp-delete-torch-service", - "template": "lightning-ddp-delete-torch-service", - "arguments": {}, - "depends": "lightning-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "lightning-ddp-0" - } - ] - } - }, - { - "name": "lightning-ddp-0", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-1", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-2", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-3", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-4", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "lightning-ddp-5", - "inputs": { - "parameters": [ - { - "name": "max_time", - "default": "00:00:00:30" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "max_time", - "value": "00:00:00:20" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-lightning-cpu-pipeline-c8drk" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678": true - } - } -} \ No newline at end of file +{"metadata": {"name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "generate_name": "pipeline-test-artifact-pipeline-jx7pb-flow-", "namespace": "argo", "uid": "e7dd825f-1f8c-4bdf-87ca-b38ae6cd773c", "resource_version": "7987", "generation": 7, "creation_timestamp": "07/12/2024", "labels": {"bettmensch.ai/pipeline-id": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92", "bettmensch.ai/pipeline-name": "pipeline-test-artifact-pipeline-jx7pb", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded"}, "annotations": {"karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2"}, "managed_fields": [{"manager": "argo", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:generateName": {}, "f:labels": {".": {}, "f:bettmensch.ai/pipeline-id": {}, "f:bettmensch.ai/pipeline-name": {}, "f:workflows.argoproj.io/creator": {}}}, "f:spec": {}}}, {"manager": "workflow-controller", "operation": "Update", "api_version": "argoproj.io/v1alpha1", "time": "07/12/2024", "fields_type": "FieldsV1", "fields_v1": {"f:metadata": {"f:annotations": {".": {}, "f:karpenter.sh/do-not-disrupt": {}, "f:workflows.argoproj.io/pod-name-format": {}}, "f:labels": {"f:workflows.argoproj.io/completed": {}, "f:workflows.argoproj.io/phase": {}}}, "f:status": {}}}]}, "spec": {"arguments": {"parameters": [{"name": "a", "value": "First integration test value a"}]}, "workflow_template_ref": {"name": "pipeline-test-artifact-pipeline-jx7pb"}}, "status": {"phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "nodes": {"pipeline-test-artifact-pipeline-jx7pb-flow-md47d": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "type": "DAG", "display_name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "phase": "Succeeded", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 68}, "children": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876"], "outbound_nodes": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188"]}, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0(0)", "type": "Pod", "display_name": "convert-to-artifact-0(0)", "template_name": "convert-to-artifact", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 43}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "a", "value": "First integration test value a"}, {"name": "a_art", "default": "null", "value": "null"}]}, "outputs": {"artifacts": [{"name": "a_art", "path": "a_art", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz"}}], "exit_code": "0"}, "children": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741"], "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal"}, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0(0)", "type": "Pod", "display_name": "show-artifact-0(0)", "template_name": "show-artifact", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 25}, "node_flag": {"retried": true}, "inputs": {"parameters": [{"name": "b", "default": "null", "value": "null"}], "artifacts": [{"name": "a", "path": "a", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz"}}]}, "outputs": {"artifacts": [{"name": "b", "path": "b", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz"}}], "exit_code": "0"}, "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal"}, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0", "type": "Retry", "display_name": "show-artifact-0", "template_name": "show-artifact", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "1/1", "resources_duration": {"cpu": 1, "memory": 25}, "inputs": {"parameters": [{"name": "b", "default": "null", "value": "null"}], "artifacts": [{"name": "a", "path": "a", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz"}}]}, "outputs": {"artifacts": [{"name": "b", "path": "b", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz"}}], "exit_code": "0"}, "children": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188"]}, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0", "type": "Retry", "display_name": "convert-to-artifact-0", "template_name": "convert-to-artifact", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 68}, "inputs": {"parameters": [{"name": "a", "value": "First integration test value a"}, {"name": "a_art", "default": "null", "value": "null"}]}, "outputs": {"artifacts": [{"name": "a_art", "path": "a_art", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz"}}], "exit_code": "0"}, "children": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518"]}, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876": {"id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag", "type": "DAG", "display_name": "bettmensch-ai-inner-dag", "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", "phase": "Succeeded", "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "started_at": "07/12/2024", "finished_at": "07/12/2024", "progress": "2/2", "resources_duration": {"cpu": 2, "memory": 68}, "inputs": {"parameters": [{"name": "a", "value": "First integration test value a"}]}, "outputs": {"artifacts": [{"name": "b", "path": "b", "s3": {"key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz"}}]}, "children": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243"], "outbound_nodes": ["pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188"]}}, "stored_templates": {"namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-inner-dag": {"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "Param A"}]}, "outputs": {"artifacts": [{"name": "b", "_from": "{{tasks.show-artifact-0.outputs.artifacts.b}}"}]}, "metadata": {}, "dag": {"tasks": [{"name": "convert-to-artifact-0", "template": "convert-to-artifact", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}]}}, {"name": "show-artifact-0", "template": "show-artifact", "arguments": {"artifacts": [{"name": "a", "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}"}]}, "depends": "convert-to-artifact-0"}]}}, "namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-outer-dag": {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}]}}]}}, "namespaced/pipeline-test-artifact-pipeline-jx7pb/convert-to-artifact": {"name": "convert-to-artifact", "inputs": {"parameters": [{"name": "a"}, {"name": "a_art", "default": "null"}]}, "outputs": {"artifacts": [{"name": "a_art", "path": "a_art"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, "namespaced/pipeline-test-artifact-pipeline-jx7pb/show-artifact": {"name": "show-artifact", "inputs": {"parameters": [{"name": "b", "default": "null"}], "artifacts": [{"name": "a", "path": "a"}]}, "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}}, "conditions": [{"type": "PodRunning", "status": "False"}, {"type": "Completed", "status": "True"}], "resources_duration": {"cpu": 2, "memory": 68}, "stored_workflow_template_spec": {"templates": [{"name": "bettmensch-ai-inner-dag", "inputs": {"parameters": [{"name": "a", "value": "Param A"}]}, "outputs": {"artifacts": [{"name": "b", "_from": "{{tasks.show-artifact-0.outputs.artifacts.b}}"}]}, "metadata": {}, "dag": {"tasks": [{"name": "convert-to-artifact-0", "template": "convert-to-artifact", "arguments": {"parameters": [{"name": "a", "value": "{{inputs.parameters.a}}"}]}}, {"name": "show-artifact-0", "template": "show-artifact", "arguments": {"artifacts": [{"name": "a", "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}"}]}, "depends": "convert-to-artifact-0"}]}}, {"name": "convert-to-artifact", "inputs": {"parameters": [{"name": "a"}, {"name": "a_art", "default": "null"}]}, "outputs": {"artifacts": [{"name": "a_art", "path": "a_art"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "show-artifact", "inputs": {"parameters": [{"name": "b", "default": "null"}], "artifacts": [{"name": "a", "path": "a"}]}, "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, "metadata": {}, "script": {"image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n", "name": "", "command": ["python"], "resources": {"limits": {"cpu": "100m", "memory": "100Mi"}, "requests": {"cpu": "100m", "memory": "100Mi"}}, "image_pull_policy": "Always"}, "retry_strategy": {"limit": "1", "retry_policy": "OnError"}}, {"name": "bettmensch-ai-outer-dag", "inputs": {}, "outputs": {}, "metadata": {}, "dag": {"tasks": [{"name": "bettmensch-ai-inner-dag", "template": "bettmensch-ai-inner-dag", "arguments": {"parameters": [{"name": "a", "value": "{{workflow.parameters.a}}"}]}}]}}], "entrypoint": "bettmensch-ai-outer-dag", "arguments": {"parameters": [{"name": "a", "value": "First integration test value a"}]}, "service_account_name": "argo-workflow", "workflow_template_ref": {"name": "pipeline-test-artifact-pipeline-jx7pb"}}, "artifact_repository_ref": {"config_map": "artifact-repositories", "key": "bettmensch-ai-artifact-repository", "namespace": "argo", "artifact_repository": {"s3": {"endpoint": "s3.us-east-2.amazonaws.com", "bucket": "bettmensch-ai-artifact-repository", "insecure": true, "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}"}}}, "artifact_gc_status": {"not_specified": true}, "task_results_completion_status": {"pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": true, "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": true}}} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_4.json b/data_models/workflows/argo/argo_workflow_4.json deleted file mode 100644 index e155f7b..0000000 --- a/data_models/workflows/argo/argo_workflow_4.json +++ /dev/null @@ -1,1993 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "generate_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-", - "namespace": "argo", - "uid": "93098e5d-b8fe-4e2a-83d8-e19b7489c980", - "resource_version": "13587", - "generation": 13, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "value": "5" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-torch-gpu-pipeline-dcfq8" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "7/7", - "nodes": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "type": "DAG", - "display_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "7/7", - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500 - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784" - ], - "outbound_nodes": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-delete-torch-service", - "type": "Pod", - "display_name": "torch-ddp-delete-torch-service", - "template_name": "torch-ddp-delete-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0", - "type": "Retry", - "display_name": "show-duration-param-0", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "60" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-1(0)", - "template_name": "torch-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-242.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3", - "type": "Retry", - "display_name": "torch-ddp-0-worker-3", - "template_name": "torch-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2", - "type": "Retry", - "display_name": "torch-ddp-0-worker-2", - "template_name": "torch-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0(0)", - "type": "Pod", - "display_name": "torch-ddp-0(0)", - "template_name": "torch-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 15, - "memory": 1112, - "nvidia.com/gpu": 138 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811" - ], - "host_node_name": "ip-10-0-49-47.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-3(0)", - "template_name": "torch-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-49-43.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1", - "type": "Retry", - "display_name": "torch-ddp-0-worker-1", - "template_name": "torch-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-2(0)", - "template_name": "torch-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-184.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-create-torch-service", - "type": "Pod", - "display_name": "torch-ddp-create-torch-service", - "template_name": "torch-ddp-create-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0", - "type": "Retry", - "display_name": "torch-ddp-0", - "template_name": "torch-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "3/3", - "resources_duration": { - "cpu": 16, - "memory": 1135, - "nvidia.com/gpu": 138 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662" - ] - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": { - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0(0)", - "type": "Pod", - "display_name": "show-duration-param-0(0)", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "60" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - } - }, - "stored_templates": { - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/show-duration-param": { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-0": { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-1": { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-2": { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-3": { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-create-torch-service": { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-delete-torch-service": { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ] - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "value": "5" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-torch-gpu-pipeline-dcfq8" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_5.json b/data_models/workflows/argo/argo_workflow_5.json deleted file mode 100644 index 7fd96f9..0000000 --- a/data_models/workflows/argo/argo_workflow_5.json +++ /dev/null @@ -1,2627 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "generate_name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-", - "namespace": "argo", - "uid": "c085649e-4392-4616-b1fd-2e553aebd469", - "resource_version": "11623", - "generation": 11, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "value": "5" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-torch-cpu-pipeline-2n6rx" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "9/9", - "nodes": { - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "type": "DAG", - "display_name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "9/9", - "resources_duration": { - "cpu": 105, - "memory": 3878 - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175" - ], - "outbound_nodes": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-create-torch-service", - "type": "Pod", - "display_name": "torch-ddp-create-torch-service", - "template_name": "torch-ddp-create-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-4", - "type": "Retry", - "display_name": "torch-ddp-0-worker-4", - "template_name": "torch-ddp-4", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 669 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-5", - "type": "Retry", - "display_name": "torch-ddp-0-worker-5", - "template_name": "torch-ddp-5", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-2", - "type": "Retry", - "display_name": "torch-ddp-0-worker-2", - "template_name": "torch-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 657 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-3", - "type": "Retry", - "display_name": "torch-ddp-0-worker-3", - "template_name": "torch-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.show-duration-param-0(0)", - "type": "Pod", - "display_name": "show-duration-param-0(0)", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "60" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-1", - "type": "Retry", - "display_name": "torch-ddp-0-worker-1", - "template_name": "torch-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-3(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-3(0)", - "template_name": "torch-ddp-3", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0", - "type": "Retry", - "display_name": "torch-ddp-0", - "template_name": "torch-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "3/3", - "resources_duration": { - "cpu": 19, - "memory": 689 - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-2(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-2(0)", - "template_name": "torch-ddp-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 657 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-1(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-1(0)", - "template_name": "torch-ddp-1", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0(0)", - "type": "Pod", - "display_name": "torch-ddp-0(0)", - "template_name": "torch-ddp-0", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 665 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-delete-torch-service", - "type": "Pod", - "display_name": "torch-ddp-delete-torch-service", - "template_name": "torch-ddp-delete-torch-service", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.show-duration-param-0", - "type": "Retry", - "display_name": "show-duration-param-0", - "template_name": "show-duration-param", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "60" - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924" - ] - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-5(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-5(0)", - "template_name": "torch-ddp-5", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341": { - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341", - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-4(0)", - "type": "Pod", - "display_name": "torch-ddp-0-worker-4(0)", - "template_name": "torch-ddp-4", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 669 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "default": "10", - "value": "5" - }, - { - "name": "duration", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value": "60", - "value_from": { - "path": "duration" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - } - }, - "stored_templates": { - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-4", - "template": "torch-ddp-4", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-5", - "template": "torch-ddp-5", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/show-duration-param": { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-0": { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-1": { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-2": { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-3": { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-4": { - "name": "torch-ddp-4", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-5": { - "name": "torch-ddp-5", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-create-torch-service": { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-delete-torch-service": { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ] - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 105, - "memory": 3878 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "torch-ddp-create-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "create", - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n" - } - }, - { - "name": "torch-ddp-delete-torch-service", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "resource": { - "action": "delete", - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ] - } - }, - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "torch-ddp-create-torch-service", - "template": "torch-ddp-create-torch-service", - "arguments": {} - }, - { - "name": "torch-ddp-0", - "template": "torch-ddp-0", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-1", - "template": "torch-ddp-1", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-2", - "template": "torch-ddp-2", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-3", - "template": "torch-ddp-3", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-4", - "template": "torch-ddp-4", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-0-worker-5", - "template": "torch-ddp-5", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}" - }, - { - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}" - } - ] - }, - "depends": "torch-ddp-create-torch-service" - }, - { - "name": "torch-ddp-delete-torch-service", - "template": "torch-ddp-delete-torch-service", - "arguments": {}, - "depends": "torch-ddp-0" - }, - { - "name": "show-duration-param-0", - "template": "show-duration-param", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" - } - ] - }, - "depends": "torch-ddp-0" - } - ] - } - }, - { - "name": "torch-ddp-0", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "ports": [ - { - "container_port": 29200, - "name": "ddp", - "protocol": "TCP" - } - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-1", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-2", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-3", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-4", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "torch-ddp-5", - "inputs": { - "parameters": [ - { - "name": "n_iter", - "default": "100" - }, - { - "name": "n_seconds_sleep", - "default": "10" - }, - { - "name": "duration", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "duration", - "value_from": { - "path": "duration" - } - } - ] - }, - "metadata": { - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "script": { - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "name": "", - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO" - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6" - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5" - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local" - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200" - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1" - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0" - } - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - }, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}" - }, - { - "name": "show-duration-param", - "inputs": { - "parameters": [ - { - "name": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "n_iter", - "value": "12" - }, - { - "name": "n_seconds_sleep", - "value": "5" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-torch-cpu-pipeline-2n6rx" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_6.json b/data_models/workflows/argo/argo_workflow_6.json deleted file mode 100644 index a5d2063..0000000 --- a/data_models/workflows/argo/argo_workflow_6.json +++ /dev/null @@ -1,670 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "generate_name": "pipeline-test-parameter-pipeline-mhwgd-flow-", - "namespace": "argo", - "uid": "8ba8d28a-5dd1-4234-a5e4-364ba12ab24b", - "resource_version": "10156", - "generation": 7, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "a", - "value": "-100" - }, - { - "name": "b", - "value": "100" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "nodes": { - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "type": "DAG", - "display_name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557" - ], - "outbound_nodes": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140" - ] - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-plus-2-0", - "type": "Retry", - "display_name": "a-plus-b-plus-2-0", - "template_name": "a-plus-b-plus-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "0" - }, - { - "name": "b", - "default": "2", - "value": "2" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "2", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140" - ] - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-0(0)", - "type": "Pod", - "display_name": "a-plus-b-0(0)", - "template_name": "a-plus-b", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "-100" - }, - { - "name": "b", - "default": "2", - "value": "100" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "0", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-plus-2-0(0)", - "type": "Pod", - "display_name": "a-plus-b-plus-2-0(0)", - "template_name": "a-plus-b-plus-2", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "0" - }, - { - "name": "b", - "default": "2", - "value": "2" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "2", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557": { - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557", - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-0", - "type": "Retry", - "display_name": "a-plus-b-0", - "template_name": "a-plus-b", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1", - "value": "-100" - }, - { - "name": "b", - "default": "2", - "value": "100" - }, - { - "name": "sum", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value": "0", - "value_from": { - "path": "sum" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908" - ] - } - }, - "stored_templates": { - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b": { - "name": "a-plus-b", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b-plus-2": { - "name": "a-plus-b-plus-2", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "a-plus-b-0", - "template": "a-plus-b", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - }, - { - "name": "b", - "value": "{{workflow.parameters.b}}" - } - ] - } - }, - { - "name": "a-plus-b-plus-2-0", - "template": "a-plus-b-plus-2", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" - }, - { - "name": "b", - "value": "2" - } - ] - }, - "depends": "a-plus-b-0" - } - ] - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "a-plus-b-0", - "template": "a-plus-b", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - }, - { - "name": "b", - "value": "{{workflow.parameters.b}}" - } - ] - } - }, - { - "name": "a-plus-b-plus-2-0", - "template": "a-plus-b-plus-2", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" - }, - { - "name": "b", - "value": "2" - } - ] - }, - "depends": "a-plus-b-0" - } - ] - } - }, - { - "name": "a-plus-b", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "a-plus-b-plus-2", - "inputs": { - "parameters": [ - { - "name": "a", - "default": "1" - }, - { - "name": "b", - "default": "2" - }, - { - "name": "sum", - "default": "null" - } - ] - }, - "outputs": { - "parameters": [ - { - "name": "sum", - "value_from": { - "path": "sum" - } - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "-100" - }, - { - "name": "b", - "value": "100" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908": true, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/argo/argo_workflow_7.json b/data_models/workflows/argo/argo_workflow_7.json deleted file mode 100644 index acbc49a..0000000 --- a/data_models/workflows/argo/argo_workflow_7.json +++ /dev/null @@ -1,552 +0,0 @@ -{ - "metadata": { - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "generate_name": "pipeline-test-artifact-pipeline-d5rzf-flow-", - "namespace": "argo", - "uid": "9948f727-967a-4905-800e-ec80117d8398", - "resource_version": "9912", - "generation": 7, - "creation_timestamp": "test-datetime-value", - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "managed_fields": [ - { - "manager": "argo", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:generateName": {}, - "f:labels": { - ".": {}, - "f:workflows.argoproj.io/creator": {} - } - }, - "f:spec": {} - } - }, - { - "manager": "workflow-controller", - "operation": "Update", - "api_version": "argoproj.io/v1alpha1", - "time": "test-datetime-value", - "fields_type": "FieldsV1", - "fields_v1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:karpenter.sh/do-not-disrupt": {}, - "f:workflows.argoproj.io/pod-name-format": {} - }, - "f:labels": { - "f:workflows.argoproj.io/completed": {}, - "f:workflows.argoproj.io/phase": {} - } - }, - "f:status": {} - } - } - ] - }, - "spec": { - "arguments": { - "parameters": [ - { - "name": "a", - "value": "First integration test value a" - } - ] - }, - "workflow_template_ref": { - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "status": { - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "nodes": { - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "type": "DAG", - "display_name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "template_name": "bettmensch-ai-dag", - "template_scope": "local/", - "phase": "Succeeded", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103" - ], - "outbound_nodes": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088" - ] - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.show-artifact-0(0)", - "type": "Pod", - "display_name": "show-artifact-0(0)", - "template_name": "show-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz" - } - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.convert-to-artifact-0", - "type": "Retry", - "display_name": "convert-to-artifact-0", - "template_name": "convert-to-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "2/2", - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "First integration test value a" - }, - { - "name": "a_art", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882" - ] - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.convert-to-artifact-0(0)", - "type": "Pod", - "display_name": "convert-to-artifact-0(0)", - "template_name": "convert-to-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 2, - "memory": 140 - }, - "node_flag": { - "retried": true - }, - "inputs": { - "parameters": [ - { - "name": "a", - "value": "First integration test value a" - }, - { - "name": "a_art", - "default": "null", - "value": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz" - } - } - ], - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737" - ], - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737": { - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737", - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.show-artifact-0", - "type": "Retry", - "display_name": "show-artifact-0", - "template_name": "show-artifact", - "template_scope": "local/", - "phase": "Succeeded", - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "started_at": "test-datetime-value", - "finished_at": "test-datetime-value", - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a", - "s3": { - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz" - } - } - ] - }, - "outputs": { - "exit_code": "0" - }, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088" - ] - } - }, - "stored_templates": { - "namespaced/pipeline-test-artifact-pipeline-d5rzf/bettmensch-ai-dag": { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "convert-to-artifact-0", - "template": "convert-to-artifact", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - } - ] - } - }, - { - "name": "show-artifact-0", - "template": "show-artifact", - "arguments": { - "artifacts": [ - { - "name": "a", - "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}" - } - ] - }, - "depends": "convert-to-artifact-0" - } - ] - } - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/convert-to-artifact": { - "name": "convert-to-artifact", - "inputs": { - "parameters": [ - { - "name": "a" - }, - { - "name": "a_art", - "default": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art" - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/show-artifact": { - "name": "show-artifact", - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - }, - "conditions": [ - { - "type": "PodRunning", - "status": "False" - }, - { - "type": "Completed", - "status": "True" - } - ], - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "stored_workflow_template_spec": { - "templates": [ - { - "name": "bettmensch-ai-dag", - "inputs": {}, - "outputs": {}, - "metadata": {}, - "dag": { - "tasks": [ - { - "name": "convert-to-artifact-0", - "template": "convert-to-artifact", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "{{workflow.parameters.a}}" - } - ] - } - }, - { - "name": "show-artifact-0", - "template": "show-artifact", - "arguments": { - "artifacts": [ - { - "name": "a", - "_from": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}" - } - ] - }, - "depends": "convert-to-artifact-0" - } - ] - } - }, - { - "name": "convert-to-artifact", - "inputs": { - "parameters": [ - { - "name": "a" - }, - { - "name": "a_art", - "default": "null" - } - ] - }, - "outputs": { - "artifacts": [ - { - "name": "a_art", - "path": "a_art" - } - ] - }, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - }, - { - "name": "show-artifact", - "inputs": { - "artifacts": [ - { - "name": "a", - "path": "a" - } - ] - }, - "outputs": {}, - "metadata": {}, - "script": { - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "name": "", - "command": [ - "python" - ], - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "image_pull_policy": "Always" - }, - "retry_strategy": { - "limit": "1", - "retry_policy": "OnError" - } - } - ], - "entrypoint": "bettmensch-ai-dag", - "arguments": { - "parameters": [ - { - "name": "a", - "value": "First integration test value a" - } - ] - }, - "service_account_name": "argo-workflow", - "workflow_template_ref": { - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "artifact_repository_ref": { - "config_map": "artifact-repositories", - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - "artifact_repository": { - "s3": { - "endpoint": "s3.us-east-2.amazonaws.com", - "bucket": "bettmensch-ai-artifact-repository", - "insecure": true - } - } - }, - "artifact_gc_status": { - "not_specified": true - }, - "task_results_completion_status": { - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088": true, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_0.json b/data_models/workflows/hera/hera_workflow_0.json index 258f8b6..edc6638 100644 --- a/data_models/workflows/hera/hera_workflow_0.json +++ b/data_models/workflows/hera/hera_workflow_0.json @@ -1,19 +1,15 @@ { - "api_version": null, - "kind": null, "metadata": { "annotations": { "karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2" }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-parameter-pipeline-mhwgd-flow-", - "generation": 6, + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-", + "generation": 13, "labels": { + "bettmensch.ai/pipeline-id": "612226a1-b40f-4f68-92c3-ea8a5d6b3995", + "bettmensch.ai/pipeline-name": "pipeline-test-torch-gpu-pipeline-7c4zp", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded" @@ -25,8 +21,7 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" }, { "api_version": "argoproj.io/v1alpha1", @@ -34,649 +29,581 @@ "fields_v1": {}, "manager": "workflow-controller", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "namespace": "argo", - "owner_references": null, - "resource_version": "18503", - "self_link": null, - "uid": "ddfe31ae-1231-4a2d-be6c-4b712bcc15a6" + "resource_version": "11463", + "uid": "ae69b1e3-a235-44d5-8667-bef63fc15821" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-10", - "value_from": null + "name": "n_iter", + "value": "15" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "20", - "value_from": null + "name": "n_seconds_sleep", + "value": "2" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-parameter-pipeline-mhwgd" + "name": "pipeline-test-torch-gpu-pipeline-7c4zp" } }, "status": { "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null + "not_specified": true }, "artifact_repository_ref": { "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, "s3": { - "access_key_secret": null, "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, "endpoint": "s3.us-east-2.amazonaws.com", "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}" } }, "config_map": "artifact-repositories", - "default": null, "key": "bettmensch-ai-artifact-repository", "namespace": "argo" }, - "compressed_nodes": null, "conditions": [ { - "message": null, "status": "False", "type": "PodRunning" }, { - "message": null, "status": "True", "type": "Completed" } ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, + "finished_at": "07/12/2024", "nodes": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp": { - "boundary_id": null, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf": { "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729" + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060" ], - "daemoned": null, - "display_name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "node_flag": null, + "display_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "outbound_nodes": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231" ], - "outputs": null, "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", + "progress": "5/5", "resources_duration": { - "cpu": 2, - "memory": 47 + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "type": "DAG" }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "children": null, - "daemoned": null, - "display_name": "a-plus-b-plus-2-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "torch-ddp-delete-torch-ddp-service", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-delete-torch-ddp-service", + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 0, + "memory": 0 + }, + "started_at": "07/12/2024", + "template_name": "torch-ddp-delete-torch-ddp-service", + "template_scope": "local/", + "type": "Pod" + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231" + ], + "display_name": "torch-ddp-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-50-210.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387", "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "10", - "value_from": null + "default": "100", + "name": "n_iter", + "value": "15" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null + "default": "10", + "name": "n_seconds_sleep", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null + "name": "duration", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0(0)", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "12", + "name": "duration", + "value": "30", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 1, - "memory": 23 + "cpu": 11, + "memory": 839, + "nvidia.com/gpu": 99 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b-plus-2", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-0", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059" ], - "daemoned": null, - "display_name": "a-plus-b-plus-2-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217", + "display_name": "torch-ddp-create-torch-ddp-service", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-create-torch-ddp-service", + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 0, + "memory": 1 + }, + "started_at": "07/12/2024", + "template_name": "torch-ddp-create-torch-ddp-service", + "template_scope": "local/", + "type": "Pod" + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694" + ], + "display_name": "show-duration-param-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "10", - "value_from": null + "value": "30" + } + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0", + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 1, + "memory": 27 + }, + "started_at": "07/12/2024", + "template_name": "show-duration-param", + "template_scope": "local/", + "type": "Retry" + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430" + ], + "display_name": "torch-ddp-0-worker-1", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null + "default": "10", + "name": "n_seconds_sleep", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null + "name": "duration", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1", "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "12", + "name": "duration", + "value": "30", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 1, - "memory": 23 + "cpu": 11, + "memory": 777, + "nvidia.com/gpu": 91 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b-plus-2", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-1", "template_scope": "local/", "type": "Retry" }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217" + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252" ], - "daemoned": null, - "display_name": "a-plus-b-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680", + "display_name": "bettmensch-ai-inner-dag", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-10", - "value_from": null + "name": "n_iter", + "value": "15" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "20", - "value_from": null + "name": "n_seconds_sleep", + "value": "2" + } + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231" + ], + "phase": "Succeeded", + "progress": "5/5", + "resources_duration": { + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190 + }, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-inner-dag", + "template_scope": "local/", + "type": "DAG" + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "torch-ddp-0-worker-1(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-50-218.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15" + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null + "name": "duration", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0(0)", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "10", + "name": "duration", + "value": "30", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 1, - "memory": 24 + "cpu": 11, + "memory": 777, + "nvidia.com/gpu": 91 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-1", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680" + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387" ], - "daemoned": null, - "display_name": "a-plus-b-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729", + "display_name": "torch-ddp-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-10", - "value_from": null + "default": "100", + "name": "n_iter", + "value": "15" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "20", - "value_from": null + "default": "10", + "name": "n_seconds_sleep", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null + "name": "duration", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0", "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "10", + "name": "duration", + "value": "30", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", + "progress": "3/3", "resources_duration": { - "cpu": 2, - "memory": 47 + "cpu": 12, + "memory": 866, + "nvidia.com/gpu": 99 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-0", "template_scope": "local/", "type": "Retry" + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "show-duration-param-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "inputs": { + "parameters": [ + { + "name": "a", + "value": "30" + } + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0(0)", + "node_flag": { + "retried": true + }, + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 1, + "memory": 27 + }, + "started_at": "07/12/2024", + "template_name": "show-duration-param", + "template_scope": "local/", + "type": "Pod" } }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, "phase": "Succeeded", - "progress": "2/2", + "progress": "5/5", "resources_duration": { - "cpu": 2, - "memory": 47 + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190 }, - "started_at": "test-datetime-value", + "started_at": "07/12/2024", "stored_templates": { - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-inner-dag": { + "dag": { + "tasks": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" + } + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" + } + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" + } + ] + }, + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null + "inputs": { + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "metrics": null, - "name": "a-plus-b", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/show-duration-param": { + "inputs": { "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } + "name": "a" } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -687,744 +614,663 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b-plus-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-0": { "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0" + } }, - "metrics": null, - "name": "a-plus-b-plus-2", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, + "name": "duration", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP" + } + ], "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-1": { + "inputs": { + "parameters": [ { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "a-plus-b-0", - "on_exit": null, - "template": "a-plus-b", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "default": "100", + "name": "n_iter" }, { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "a-plus-b-0", - "hooks": null, - "inline": null, - "name": "a-plus-b-plus-2-0", - "on_exit": null, - "template": "a-plus-b-plus-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "default": "10", + "name": "n_seconds_sleep" + }, + { + "default": "null", + "name": "duration" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1" + } }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "duration", + "value_from": { + "path": "duration" + } + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "script": { + "command": [ + "python" + ], + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-create-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-delete-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo" + ] + } } }, "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-10", - "value_from": null + "name": "n_iter", + "value": "15" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "20", - "value_from": null + "name": "n_seconds_sleep", + "value": "2" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, + "entrypoint": "bettmensch-ai-outer-dag", "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } + }, + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo" + ] + } + }, + { "dag": { - "fail_fast": null, - "target": null, "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" + }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "a-plus-b-0", - "on_exit": null, - "template": "a-plus-b", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "a-plus-b-0", - "hooks": null, - "inline": null, - "name": "a-plus-b-plus-2-0", - "on_exit": null, - "template": "a-plus-b-plus-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" + } + ] + }, + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0" + } }, - "metrics": null, - "name": "a-plus-b", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, + "name": "duration", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP" + } + ], "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "700Mi", + "nvidia.com/gpu": "1" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" }, { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1" + } }, - "metrics": null, - "name": "a-plus-b-plus-2", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, + "name": "duration", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null + "path": "duration" } } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" + }, + "script": { + "command": [ + "python" + ], + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } ], - "result": null + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists" + } + ] + }, + { + "inputs": { + "parameters": [ + { + "name": "a" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -1435,44 +1281,46 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-parameter-pipeline-mhwgd" + "name": "pipeline-test-torch-gpu-pipeline-7c4zp" } }, - "synchronization": null, "task_results_completion_status": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": true, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": true + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": true, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": true, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": true, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": true, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": true } } } \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_1.json b/data_models/workflows/hera/hera_workflow_1.json index 1ea6e95..b5d9088 100644 --- a/data_models/workflows/hera/hera_workflow_1.json +++ b/data_models/workflows/hera/hera_workflow_1.json @@ -1,19 +1,15 @@ { - "api_version": null, - "kind": null, "metadata": { "annotations": { "karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2" }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-artifact-pipeline-d5rzf-flow-", - "generation": 6, + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-", + "generation": 12, "labels": { + "bettmensch.ai/pipeline-id": "9de5c132-b8d2-44c8-b52e-47bfa710b7df", + "bettmensch.ai/pipeline-name": "pipeline-test-torch-cpu-pipeline-hgcxv", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded" @@ -25,8 +21,7 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" }, { "api_version": "argoproj.io/v1alpha1", @@ -34,1317 +29,1205 @@ "fields_v1": {}, "manager": "workflow-controller", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "namespace": "argo", - "owner_references": null, - "resource_version": "18180", - "self_link": null, - "uid": "dc477fa6-dd12-43b7-8511-e3dc03bf023c" + "resource_version": "9564", + "uid": "15d5987d-9e1c-4606-82a0-611e6c7b19ee" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "Second integration test value a", - "value_from": null + "name": "n_iter", + "value": "15" + }, + { + "name": "n_seconds_sleep", + "value": "2" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-artifact-pipeline-d5rzf" + "name": "pipeline-test-torch-cpu-pipeline-hgcxv" } }, "status": { "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null + "not_specified": true }, "artifact_repository_ref": { "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, "s3": { - "access_key_secret": null, "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, "endpoint": "s3.us-east-2.amazonaws.com", "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}" } }, "config_map": "artifact-repositories", - "default": null, "key": "bettmensch-ai-artifact-repository", "namespace": "argo" }, - "compressed_nodes": null, "conditions": [ { - "message": null, "status": "False", "type": "PodRunning" }, { - "message": null, "status": "True", "type": "Completed" } ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, + "finished_at": "07/12/2024", "nodes": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9": { - "boundary_id": null, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9": { "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393" + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078" ], - "daemoned": null, - "display_name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "node_flag": null, + "display_name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", "outbound_nodes": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825" ], - "outputs": null, "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", + "progress": "5/5", "resources_duration": { - "cpu": 2, - "memory": 48 + "cpu": 26, + "memory": 1054 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "type": "DAG" }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", + "display_name": "torch-ddp-0-worker-1(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15" + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2" + }, + { + "default": "null", + "name": "duration", + "value": "null" + } + ] + }, + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0-worker-1(0)", + "node_flag": { + "retried": true + }, + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": { + "path": "duration" + } + } + ] + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 12, + "memory": 496 + }, + "started_at": "07/12/2024", + "template_name": "torch-ddp-1", + "template_scope": "local/", + "type": "Pod" + }, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", + "display_name": "torch-ddp-delete-torch-ddp-service", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-delete-torch-ddp-service", + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 0, + "memory": 0 + }, + "started_at": "07/12/2024", + "template_name": "torch-ddp-delete-torch-ddp-service", + "template_scope": "local/", + "type": "Pod" + }, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", + "children": [ + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069" + ], + "display_name": "torch-ddp-create-torch-ddp-service", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-create-torch-ddp-service", + "outputs": { + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 0, + "memory": 1 + }, + "started_at": "07/12/2024", + "template_name": "torch-ddp-create-torch-ddp-service", + "template_scope": "local/", + "type": "Pod" + }, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249" ], - "daemoned": null, - "display_name": "show-artifact-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743", + "display_name": "torch-ddp-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2699628426", "inputs": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null + "default": "100", + "name": "n_iter", + "value": "15" + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2" + }, + { + "default": "null", + "name": "duration", + "value": "null" } - ], - "parameters": null + ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0", "outputs": { - "artifacts": null, "exit_code": "0", - "parameters": null, - "result": null + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": { + "path": "duration" + } + } + ] }, "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", + "progress": "3/3", "resources_duration": { - "cpu": 1, - "memory": 24 + "cpu": 14, + "memory": 557 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-artifact", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-0", "template_scope": "local/", "type": "Retry" }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743" + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825" ], - "daemoned": null, - "display_name": "convert-to-artifact-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056", + "display_name": "torch-ddp-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249", "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "Second integration test value a", - "value_from": null + "default": "100", + "name": "n_iter", + "value": "15" + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": "null", - "value_from": null + "name": "duration", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0(0)", + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": [ + "exit_code": "0", + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null + "name": "duration", + "value": "30", + "value_from": { + "path": "duration" + } } - ], - "exit_code": "0", - "parameters": null, - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 1, - "memory": 24 + "cpu": 13, + "memory": 527 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "convert-to-artifact", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-0", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "children": null, - "daemoned": null, - "display_name": "show-artifact-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9", + "children": [ + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150" + ], + "display_name": "bettmensch-ai-inner-dag", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "inputs": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null + "name": "n_iter", + "value": "15" + }, + { + "name": "n_seconds_sleep", + "value": "2" } - ], - "parameters": null + ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0(0)", - "node_flag": { - "hooked": null, - "retried": true + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825" + ], + "phase": "Succeeded", + "progress": "5/5", + "resources_duration": { + "cpu": 26, + "memory": 1054 + }, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-inner-dag", + "template_scope": "local/", + "type": "DAG" + }, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", + "children": [ + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172" + ], + "display_name": "torch-ddp-0-worker-1", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3370869069", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15" + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2" + }, + { + "default": "null", + "name": "duration", + "value": "null" + } + ] }, - "outbound_nodes": null, + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.torch-ddp-0-worker-1", "outputs": { - "artifacts": null, "exit_code": "0", - "parameters": null, - "result": null + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": { + "path": "duration" + } + } + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 1, - "memory": 24 + "cpu": 12, + "memory": 496 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-artifact", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "torch-ddp-1", "template_scope": "local/", - "type": "Pod" + "type": "Retry" }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056" + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508" ], - "daemoned": null, - "display_name": "convert-to-artifact-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393", + "display_name": "show-duration-param-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3901607477", "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "Second integration test value a", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": "null", - "value_from": null + "value": "30" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.show-duration-param-0", "outputs": { - "artifacts": [ + "exit_code": "0" + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 1, + "memory": 30 + }, + "started_at": "07/12/2024", + "template_name": "show-duration-param", + "template_scope": "local/", + "type": "Retry" + }, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508": { + "boundary_id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-3111033078", + "display_name": "show-duration-param-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508", + "inputs": { + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null + "name": "a", + "value": "30" } - ], - "exit_code": "0", - "parameters": null, - "result": null + ] + }, + "name": "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9.bettmensch-ai-inner-dag.show-duration-param-0(0)", + "node_flag": { + "retried": true + }, + "outputs": { + "exit_code": "0" }, "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", + "progress": "1/1", "resources_duration": { - "cpu": 2, - "memory": 48 + "cpu": 1, + "memory": 30 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "convert-to-artifact", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "show-duration-param", "template_scope": "local/", - "type": "Retry" + "type": "Pod" } }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, "phase": "Succeeded", - "progress": "2/2", + "progress": "5/5", "resources_duration": { - "cpu": 2, - "memory": 48 + "cpu": 26, + "memory": 1054 }, - "started_at": "test-datetime-value", + "started_at": "07/12/2024", "stored_templates": { - "namespaced/pipeline-test-artifact-pipeline-d5rzf/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/bettmensch-ai-inner-dag": { "dag": { - "fail_fast": null, - "target": null, "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" + }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" + } + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "convert-to-artifact-0", - "on_exit": null, - "template": "convert-to-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" }, { "arguments": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, "name": "a", - "optional": null, - "oss": null, - "path": null, - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" } - ], - "parameters": null + ] }, - "continue_on": null, - "dependencies": null, - "depends": "convert-to-artifact-0", - "hooks": null, - "inline": null, - "name": "show-artifact-0", - "on_exit": null, - "template": "show-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} + }, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} + }, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/show-duration-param": { + "inputs": { + "parameters": [ + { + "name": "a" + } + ] + }, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "script": { + "command": [ + "python" + ], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "100Mi" + }, + "requests": { + "cpu": "100m", + "memory": "100Mi" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/convert-to-artifact": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-0": { "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" + }, + { + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0" + } }, - "metrics": null, - "name": "convert-to-artifact", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "name": "duration", + "value_from": { + "path": "duration" + } } - ], - "exit_code": null, - "parameters": null, - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP" + } + ], "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/show-artifact": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-1": { "inputs": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "default": "100", + "name": "n_iter" + }, + { + "default": "10", + "name": "n_seconds_sleep" + }, + { + "default": "null", + "name": "duration" } - ], - "parameters": null + ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1" + } }, - "metrics": null, - "name": "show-artifact", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "duration", + "value_from": { + "path": "duration" + } + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } + }, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-create-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } + }, + "namespaced/pipeline-test-torch-cpu-pipeline-hgcxv/torch-ddp-delete-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo" + ] + } } }, "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "Second integration test value a", - "value_from": null + "name": "n_iter", + "value": "15" + }, + { + "name": "n_seconds_sleep", + "value": "2" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, + "entrypoint": "bettmensch-ai-outer-dag", "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n" + } + }, + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo" + ] + } + }, + { "dag": { - "fail_fast": null, - "target": null, "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service" + }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "convert-to-artifact-0", - "on_exit": null, - "template": "convert-to-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0" }, { "arguments": { - "artifacts": [ + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}" + } + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1" + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service" + }, + { + "arguments": { + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, "name": "a", - "optional": null, - "oss": null, - "path": null, - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}" } - ], - "parameters": null + ] }, - "continue_on": null, - "dependencies": null, - "depends": "convert-to-artifact-0", - "hooks": null, - "inline": null, - "name": "show-artifact-0", - "on_exit": null, - "template": "show-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "n_iter" + }, + { + "name": "n_seconds_sleep" + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {} }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "default": "100", + "name": "n_iter" + }, + { + "default": "10", + "name": "n_seconds_sleep" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": null, - "value_from": null + "name": "duration" } ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0" + } }, - "metrics": null, - "name": "convert-to-artifact", - "node_selector": null, + "name": "torch-ddp-0", "outputs": { - "artifacts": [ + "parameters": [ { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "name": "duration", + "value_from": { + "path": "duration" + } } - ], - "exit_code": null, - "parameters": null, - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP" + } + ], "resources": { "limits": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" }, "requests": { "cpu": "100m", - "memory": "100Mi" + "memory": "300Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": [ + "parameters": [ + { + "default": "100", + "name": "n_iter" + }, { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null + "default": "10", + "name": "n_seconds_sleep" + }, + { + "default": "null", + "name": "duration" } - ], - "parameters": null + ] }, - "memoize": null, "metadata": { - "annotations": null, - "labels": null + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1" + } }, - "metrics": null, - "name": "show-artifact", - "node_selector": null, + "name": "torch-ddp-1", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null + "parameters": [ + { + "name": "duration", + "value_from": { + "path": "duration" + } + } + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "env": [ + { + "name": "NCCL_DEBUG", + "value": "INFO" + }, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2" + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local" + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200" + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1" + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0" + } + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "300Mi" + }, + "requests": { + "cpu": "100m", + "memory": "300Mi" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)" + } + }, + { + "inputs": { + "parameters": [ + { + "name": "a" + } + ] + }, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" + }, + "script": { + "command": [ + "python" + ], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -1355,44 +1238,46 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}" + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-artifact-pipeline-d5rzf" + "name": "pipeline-test-torch-cpu-pipeline-hgcxv" } }, - "synchronization": null, "task_results_completion_status": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": true, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": true + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1599494172": true, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-1742334825": true, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2373051150": true, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-2872643249": true, + "pipeline-test-torch-cpu-pipeline-hgcxv-flow-cbbn9-59759508": true } } } \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_2.json b/data_models/workflows/hera/hera_workflow_2.json index d11a081..eaa5627 100644 --- a/data_models/workflows/hera/hera_workflow_2.json +++ b/data_models/workflows/hera/hera_workflow_2.json @@ -1,19 +1,15 @@ { - "api_version": null, - "kind": null, "metadata": { "annotations": { "karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2" }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-", - "generation": 12, + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-parameter-pipeline-c877j-flow-", + "generation": 7, "labels": { + "bettmensch.ai/pipeline-id": "d2715290-865d-4776-84c4-776632cd7159", + "bettmensch.ai/pipeline-name": "pipeline-test-parameter-pipeline-c877j", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded" @@ -25,8 +21,7 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" }, { "api_version": "argoproj.io/v1alpha1", @@ -34,3628 +29,711 @@ "fields_v1": {}, "manager": "workflow-controller", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "namespace": "argo", - "owner_references": null, - "resource_version": "17861", - "self_link": null, - "uid": "d48f4d8d-61b1-4b86-a200-49c525c6f516" + "resource_version": "8018", + "uid": "f4623367-e5c2-4ba7-9a7a-633c55314421" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "-100" + }, + { + "name": "b", + "value": "100" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2" + "name": "pipeline-test-parameter-pipeline-c877j" } }, "status": { "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null + "not_specified": true }, "artifact_repository_ref": { "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, "s3": { - "access_key_secret": null, "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, "endpoint": "s3.us-east-2.amazonaws.com", "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}" } }, "config_map": "artifact-repositories", - "default": null, "key": "bettmensch-ai-artifact-repository", "namespace": "argo" }, - "compressed_nodes": null, "conditions": [ { - "message": null, "status": "False", "type": "PodRunning" }, { - "message": null, "status": "True", "type": "Completed" } ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, + "finished_at": "07/12/2024", "nodes": { - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d": { - "boundary_id": null, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn": { "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919" + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891" ], - "daemoned": null, - "display_name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "node_flag": null, + "display_name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "finished_at": "07/12/2024", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", "outbound_nodes": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476" + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" ], - "outputs": null, "phase": "Succeeded", - "pod_ip": null, - "progress": "7/7", + "progress": "2/2", "resources_duration": { - "cpu": 128, - "memory": 2228, - "nvidia.com/gpu": 179 + "cpu": 2, + "memory": 54 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "type": "DAG" }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-1(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-49-51.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660", + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323" + ], + "display_name": "bettmensch-ai-inner-dag", + "finished_at": "07/12/2024", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "-100" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "b", + "value": "100" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-1(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" + ], "outputs": { - "artifacts": null, - "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:23.332028", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } + "name": "sum", + "value": "2" } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", + "progress": "2/2", "resources_duration": { - "cpu": 34, - "memory": 587, - "nvidia.com/gpu": 48 + "cpu": 2, + "memory": 54 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-1", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", - "type": "Pod" + "type": "DAG" }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645" + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911" ], - "daemoned": null, - "display_name": "lightning-ddp-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-49-145.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233", + "display_name": "a-plus-b-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "1", + "name": "a", + "value": "-100" + }, + { + "default": "2", + "name": "b", + "value": "100" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0(0)", + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:23.295598", + "name": "sum", + "value": "0", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 26, - "memory": 467, - "nvidia.com/gpu": 37 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": null, - "daemoned": null, - "display_name": "show-duration-param-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0:00:23.295598", - "value_from": null - } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.show-duration-param-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { "cpu": 1, - "memory": 24 + "memory": 28 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "a-plus-b", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233" + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" ], - "daemoned": null, - "display_name": "lightning-ddp-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050", + "display_name": "a-plus-b-plus-2-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "1", + "name": "a", + "value": "0" + }, + { + "default": "2", + "name": "b", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0", "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:23.295598", + "name": "sum", + "value": "2", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "3/3", - "resources_duration": { - "cpu": 27, - "memory": 491, - "nvidia.com/gpu": 37 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-delete-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-delete-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 0, - "memory": 0 + "cpu": 1, + "memory": 26 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-delete-torch-service", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "a-plus-b-plus-2", "template_scope": "local/", - "type": "Pod" + "type": "Retry" }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476" + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278" ], - "daemoned": null, - "display_name": "show-duration-param-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3009250645", + "display_name": "a-plus-b-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323", "inputs": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, + "default": "1", "name": "a", - "value": "0:00:23.295598", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.show-duration-param-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-3(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-29.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506", - "inputs": { - "artifacts": null, - "parameters": [ + "value": "-100" + }, { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "2", + "name": "b", + "value": "100" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-3(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0", "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:22.990339", + "name": "sum", + "value": "0", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", + "progress": "2/2", "resources_duration": { - "cpu": 36, - "memory": 606, - "nvidia.com/gpu": 50 + "cpu": 2, + "memory": 54 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-3", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "a-plus-b", "template_scope": "local/", - "type": "Pod" + "type": "Retry" }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-2(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-166.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951", + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "display_name": "a-plus-b-plus-2-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "1", + "name": "a", + "value": "0" + }, + { + "default": "2", + "name": "b", + "value": "2" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-2(0)", + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:22.838134", + "name": "sum", + "value": "2", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 31, - "memory": 544, - "nvidia.com/gpu": 44 + "cpu": 1, + "memory": 26 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-2", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "a-plus-b-plus-2", "template_scope": "local/", "type": "Pod" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-2", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196", + } + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": { + "cpu": 2, + "memory": 54 + }, + "started_at": "07/12/2024", + "stored_templates": { + "namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b": { "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "1", + "name": "a" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-2", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:22.838134", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 31, - "memory": 544, - "nvidia.com/gpu": 44 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-3", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-3", - "node_flag": null, - "outbound_nodes": null, + "metadata": {}, + "name": "a-plus-b", "outputs": { - "artifacts": null, - "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:22.990339", + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 606, - "nvidia.com/gpu": 50 + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" + "script": { + "command": [ + "python" + ], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "100Mi" + }, + "requests": { + "cpu": "100m", + "memory": "100Mi" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-1", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053", + "namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b-plus-2": { "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "default": "1", + "name": "a" + }, + { + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "sum" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-0-worker-1", - "node_flag": null, - "outbound_nodes": null, + "metadata": {}, + "name": "a-plus-b-plus-2", "outputs": { - "artifacts": null, - "exit_code": "0", "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:23.332028", + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 34, - "memory": 587, - "nvidia.com/gpu": 48 + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" + "script": { + "command": [ + "python" + ], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "100Mi" + }, + "requests": { + "cpu": "100m", + "memory": "100Mi" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } }, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919": { - "boundary_id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d", - "children": [ - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-905808053", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-855475196", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-872252815", - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2032602050" - ], - "daemoned": null, - "display_name": "lightning-ddp-create-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d.lightning-ddp-create-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-create-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "7/7", - "resources_duration": { - "cpu": 128, - "memory": 2228, - "nvidia.com/gpu": 179 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, + "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-inner-dag": { "dag": { - "fail_fast": null, - "target": null, "tasks": [ { "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ + "name": "a", + "value": "{{inputs.parameters.a}}" + }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null + "name": "b", + "value": "{{inputs.parameters.b}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "name": "a-plus-b-0", + "template": "a-plus-b" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" + }, + { + "name": "b", + "value": "2" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-0": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "name": "a", + "value": "1" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "b", + "value": "2" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "0" - } - }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-1": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" - } - }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-3": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-create-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/lightning-ddp-delete-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + } }, - "namespaced/pipeline-test-lightning-gpu-pipeline-9r6h2/show-duration-param": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ + "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } }, "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "-100" + }, + { + "name": "b", + "value": "100" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, + "entrypoint": "bettmensch-ai-outer-dag", "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n namespace: argo\n labels:\n app: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, + "tasks": [ { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ + "name": "a", + "value": "{{inputs.parameters.a}}" + }, { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null + "name": "b", + "value": "{{inputs.parameters.b}}" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "name": "a-plus-b-0", + "template": "a-plus-b" }, { "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}" + }, + { + "name": "b", + "value": "2" } ] }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2" } ] }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "name": "a", + "value": "1" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "b", + "value": "2" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "0" - } - }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null + ] + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "default": "1", + "name": "a" }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "sum" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "2" - } - }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, + "metadata": {}, + "name": "a-plus-b", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "100Mi" }, "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" + "cpu": "100m", + "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "default": "1", + "name": "a" + }, + { + "default": "2", + "name": "b" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "sum" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, + "metadata": {}, + "name": "a-plus-b-plus-2", "outputs": { - "artifacts": null, - "exit_code": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, + "name": "sum", "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "path": "sum" } } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-3278f52c-b445-42e4-8e6e-ad2e351afcc8.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -3666,49 +744,43 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-lightning-gpu-pipeline-9r6h2" + "name": "pipeline-test-parameter-pipeline-c877j" } }, - "synchronization": null, "task_results_completion_status": { - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1639120660": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1697154233": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-1820439476": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-2871044736": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3164367506": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-3295920951": true, - "pipeline-test-lightning-gpu-pipeline-9r6h2-flow-c7v5d-966953919": true + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": true, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": true } } } \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_3.json b/data_models/workflows/hera/hera_workflow_3.json index 6e767d7..a1daca1 100644 --- a/data_models/workflows/hera/hera_workflow_3.json +++ b/data_models/workflows/hera/hera_workflow_3.json @@ -1,19 +1,15 @@ { - "api_version": null, - "kind": null, "metadata": { "annotations": { "karpenter.sh/do-not-disrupt": "true", "workflows.argoproj.io/pod-name-format": "v2" }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-", - "generation": 18, + "creation_timestamp": "07/12/2024", + "generate_name": "pipeline-test-artifact-pipeline-jx7pb-flow-", + "generation": 7, "labels": { + "bettmensch.ai/pipeline-id": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92", + "bettmensch.ai/pipeline-name": "pipeline-test-artifact-pipeline-jx7pb", "workflows.argoproj.io/completed": "true", "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", "workflows.argoproj.io/phase": "Succeeded" @@ -25,8 +21,7 @@ "fields_v1": {}, "manager": "argo", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" }, { "api_version": "argoproj.io/v1alpha1", @@ -34,4804 +29,634 @@ "fields_v1": {}, "manager": "workflow-controller", "operation": "Update", - "subresource": null, - "time": "test-datetime-value" + "time": "07/12/2024" } ], - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "namespace": "argo", - "owner_references": null, - "resource_version": "16194", - "self_link": null, - "uid": "bfe2cd60-7fa7-48ba-96f1-0845dbc142a8" + "resource_version": "7987", + "uid": "e7dd825f-1f8c-4bdf-87ca-b38ae6cd773c" }, "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, "arguments": { - "artifacts": null, "parameters": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "First integration test value a" } ] }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk" + "name": "pipeline-test-artifact-pipeline-jx7pb" } }, "status": { "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null + "not_specified": true }, "artifact_repository_ref": { "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, "s3": { - "access_key_secret": null, "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, "endpoint": "s3.us-east-2.amazonaws.com", "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}" } }, "config_map": "artifact-repositories", - "default": null, "key": "bettmensch-ai-artifact-repository", "namespace": "argo" }, - "compressed_nodes": null, "conditions": [ { - "message": null, "status": "False", "type": "PodRunning" }, { - "message": null, "status": "True", "type": "Completed" } ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, + "finished_at": "07/12/2024", "nodes": { - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq": { - "boundary_id": null, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d": { "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449" + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876" ], - "daemoned": null, - "display_name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "node_flag": null, + "display_name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "finished_at": "07/12/2024", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "outbound_nodes": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979" + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" ], - "outputs": null, "phase": "Succeeded", - "pod_ip": null, - "progress": "9/10", + "progress": "2/2", "resources_duration": { - "cpu": 235, - "memory": 4168 + "cpu": 2, + "memory": 68 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-outer-dag", "template_scope": "local/", "type": "DAG" }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-2(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593", + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741" + ], + "display_name": "convert-to-artifact-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "First integration test value a" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "a_art", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-2(0)", + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.986543", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "a_art", + "path": "a_art", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" } } ], - "result": null + "exit_code": "0" }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 52, - "memory": 898 + "cpu": 1, + "memory": 43 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-2", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "convert-to-artifact", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-3", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813", + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "display_name": "show-artifact-0(0)", + "finished_at": "07/12/2024", + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188", "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-3", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.132619", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "a", + "path": "a", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" } } ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 39, - "memory": 684 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-1(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762", - "inputs": { - "artifacts": null, "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "b", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-1(0)", + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0(0)", "node_flag": { - "hooked": null, "retried": true }, - "outbound_nodes": null, "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.512020", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" } } ], - "result": null + "exit_code": "0" }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { - "cpu": 52, - "memory": 899 + "cpu": 1, + "memory": 25 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-1", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "show-artifact", "template_scope": "local/", "type": "Pod" }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-4(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-203.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291", + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" + ], + "display_name": "show-artifact-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741", "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-4(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.044815", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "a", + "path": "a", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" } } ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 36, - "memory": 633 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-4", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-5(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-142.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230", - "inputs": { - "artifacts": null, "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "b", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-5(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0", "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.170848", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" } } ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 28, - "memory": 519 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-5", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "show-duration-param-0(1)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0:00:20.968705", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0(1)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null + "exit_code": "0" }, "phase": "Succeeded", - "pod_ip": null, "progress": "1/1", "resources_duration": { "cpu": 1, "memory": 25 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-delete-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-142.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-delete-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-delete-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-23383813", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356" - ], - "daemoned": null, - "display_name": "lightning-ddp-create-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-create-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-create-torch-service", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "show-artifact", "template_scope": "local/", - "type": "Pod" + "type": "Retry" }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871" + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518" ], - "daemoned": null, - "display_name": "lightning-ddp-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4087624356", + "display_name": "convert-to-artifact-0", + "finished_at": "07/12/2024", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a", + "value": "First integration test value a" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "a_art", + "value": "null" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0", "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.968705", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "a_art", + "path": "a_art", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" } } ], - "result": null + "exit_code": "0" }, "phase": "Succeeded", - "pod_ip": null, - "progress": "3/4", + "progress": "2/2", "resources_duration": { - "cpu": 28, - "memory": 535 + "cpu": 2, + "memory": 68 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-0", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "convert-to-artifact", "template_scope": "local/", "type": "Retry" }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291" + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243" ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-4", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4200907776", + "display_name": "bettmensch-ai-inner-dag", + "finished_at": "07/12/2024", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "a", + "value": "First integration test value a" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-4", - "node_flag": null, - "outbound_nodes": null, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" + ], "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.044815", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" } } - ], - "result": null + ] }, "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", + "progress": "2/2", "resources_duration": { - "cpu": 36, - "memory": 633 + "cpu": 2, + "memory": 68 }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-4", - "template_ref": null, + "started_at": "07/12/2024", + "template_name": "bettmensch-ai-inner-dag", "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499" - ], - "daemoned": null, - "display_name": "lightning-ddp-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-149.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871", - "inputs": { - "artifacts": null, - "parameters": [ + "type": "DAG" + } + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": { + "cpu": 2, + "memory": 68 + }, + "started_at": "07/12/2024", + "stored_templates": { + "namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-inner-dag": { + "dag": { + "tasks": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}" + } + ] + }, + "name": "convert-to-artifact-0", + "template": "convert-to-artifact" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "arguments": { + "artifacts": [ + { + "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", + "name": "a" + } + ] + }, + "depends": "convert-to-artifact-0", + "name": "show-artifact-0", + "template": "show-artifact" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.968705", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 27, - "memory": 510 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-5", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4217685395", "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "a", + "value": "Param A" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-5", - "node_flag": null, - "outbound_nodes": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.170848", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b" } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 28, - "memory": 519 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-5", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" + ] + } }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-1", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4284795871", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, + "namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-1", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.512020", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 899 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593" - ], - "daemoned": null, - "display_name": "lightning-ddp-0-worker-2", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-6606194", + "namespaced/pipeline-test-artifact-pipeline-jx7pb/convert-to-artifact": { "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null + "name": "a" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "a_art" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-2", - "node_flag": null, - "outbound_nodes": null, + "metadata": {}, + "name": "convert-to-artifact", "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:20.986543", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } + "name": "a_art", + "path": "a_art" } - ], - "result": null + ] }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 52, - "memory": 898 + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError" }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" + "script": { + "command": [ + "python" + ], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "100Mi" + }, + "requests": { + "cpu": "100m", + "memory": "100Mi" + } + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n" + } }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "lightning-ddp-0-worker-3(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-49-32.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340", + "namespaced/pipeline-test-artifact-pipeline-jx7pb/show-artifact": { "inputs": { - "artifacts": null, - "parameters": [ + "artifacts": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - }, + "name": "a", + "path": "a" + } + ], + "parameters": [ { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null + "name": "b" } ] }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.lightning-ddp-0-worker-3(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, + "metadata": {}, + "name": "show-artifact", "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "0:00:21.132619", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } + "name": "b", + "path": "b" } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 39, - "memory": 684 + ] }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "lightning-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": [ - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678", - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979" - ], - "daemoned": null, - "display_name": "show-duration-param-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-932828499", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0:00:20.968705", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/2", - "resources_duration": { - "cpu": 1, - "memory": 25 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678": { - "boundary_id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq", - "children": null, - "daemoned": null, - "display_name": "show-duration-param-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-149.us-east-2.compute.internal", - "id": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0:00:20.968705", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": "pod deleted", - "name": "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq.show-duration-param-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": null, - "phase": "Error", - "pod_ip": null, - "progress": "0/1", - "resources_duration": null, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "9/10", - "resources_duration": { - "cpu": 235, - "memory": 4168 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-4", - "on_exit": null, - "template": "lightning-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-5", - "on_exit": null, - "template": "lightning-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-0": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-1": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-3": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-4": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "metrics": null, - "name": "lightning-ddp-4", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-5": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "metrics": null, - "name": "lightning-ddp-5", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-create-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/lightning-ddp-delete-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-lightning-cpu-pipeline-c8drk/show-duration-param": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "00:00:00:20", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n namespace: argo\n labels:\n app: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "lightning-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "lightning-ddp-create-torch-service", - "on_exit": null, - "template": "lightning-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0", - "on_exit": null, - "template": "lightning-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-1", - "on_exit": null, - "template": "lightning-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-2", - "on_exit": null, - "template": "lightning-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-3", - "on_exit": null, - "template": "lightning-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-4", - "on_exit": null, - "template": "lightning-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": "{{workflow.parameters.max_time}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "lightning-ddp-0-worker-5", - "on_exit": null, - "template": "lightning-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "lightning-ddp-delete-torch-service", - "on_exit": null, - "template": "lightning-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.lightning-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "lightning-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "0" - } - }, - "metrics": null, - "name": "lightning-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "1" - } - }, - "metrics": null, - "name": "lightning-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } + "limits": { + "cpu": "100m", + "memory": "100Mi" }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "requests": { + "cpu": "100m", + "memory": "100Mi" + } }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n" + } + } + }, + "stored_workflow_template_spec": { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "First integration test value a" + } + ] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "service_account_name": "argo-workflow", + "templates": [ { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ + "dag": { + "tasks": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}" + } + ] + }, + "name": "convert-to-artifact-0", + "template": "convert-to-artifact" }, { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "arguments": { + "artifacts": [ + { + "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", + "name": "a" + } + ] + }, + "depends": "convert-to-artifact-0", + "name": "show-artifact-0", + "template": "show-artifact" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "2" - } - }, - "metrics": null, - "name": "lightning-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "a", + "value": "Param A" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "3" - } - }, - "metrics": null, - "name": "lightning-ddp-3", - "node_selector": null, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, + "artifacts": [ { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b" } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + ] + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, "parameters": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null + "name": "a" }, { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "a_art" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "4" - } - }, - "metrics": null, - "name": "lightning-ddp-4", - "node_selector": null, + "metadata": {}, + "name": "convert-to-artifact", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } + "name": "a_art", + "path": "a_art" } - ], - "result": null + ] }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { - "cpu": "700m", - "memory": "1Gi" + "cpu": "100m", + "memory": "100Mi" }, "requests": { - "cpu": "700m", - "memory": "1Gi" + "cpu": "100m", + "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n" + } }, { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, "inputs": { - "artifacts": null, - "parameters": [ + "artifacts": [ { - "default": "00:00:00:30", - "description": null, - "enum": null, - "global_name": null, - "name": "max_time", - "value": null, - "value_from": null - }, + "name": "a", + "path": "a" + } + ], + "parameters": [ { "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null + "name": "b" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e", - "torch-node": "5" - } - }, - "metrics": null, - "name": "lightning-ddp-5", - "node_selector": null, + "metadata": {}, + "name": "show-artifact", "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "lightning-ddp-0-6dfeb612-01b3-40f6-b40c-64eb9cc9eb6e.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-lightning:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "700m", - "memory": "1Gi" - }, - "requests": { - "cpu": "700m", - "memory": "1Gi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: max_time = json.loads(r'''{{inputs.parameters.max_time}}''')\nexcept: max_time = r'''{{inputs.parameters.max_time}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef lightning_ddp(max_time: InputParameter='00:00:00:30', duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n from datetime import datetime as dt\n import lightning.pytorch as pl\n import torch\n from bettmensch_ai.components.torch_utils import LaunchConfigSettings\n from lightning.pytorch.strategies import DDPStrategy\n start = dt.now()\n\n class ToyExample(pl.LightningModule):\n\n def __init__(self, model):\n super().__init__()\n self.model = model\n\n def training_step(self, batch):\n loss = self.model(batch).sum()\n return loss\n\n def configure_optimizers(self):\n return torch.optim.Adam(self.model.parameters())\n model = torch.nn.Linear(32, 2)\n pl_module = ToyExample(model)\n train_dataloader = torch.utils.data.DataLoader(torch.randn(8, 32))\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n process_group_backend = 'nccl' if has_gpu else 'gloo'\n accelerator = 'gpu' if has_gpu else 'cpu'\n ddp = DDPStrategy(process_group_backend=process_group_backend)\n launch_settings = LaunchConfigSettings()\n trainer = pl.Trainer(strategy=ddp, accelerator=accelerator, num_nodes=launch_settings.max_nodes, devices=launch_settings.nproc_per_node, max_time=max_time)\n trainer.fit(pl_module, train_dataloader)\n if duration is not None:\n duration.assign(dt.now() - start)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(lightning_ddp)\n\ntorch_distributed_function(max_time,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ + "artifacts": [ { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null + "name": "b", + "path": "b" } ] }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, "limit": "1", "retry_policy": "OnError" }, - "scheduler_name": null, "script": { - "args": null, "command": [ "python" ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, "name": "", - "ports": null, - "readiness_probe": null, "resources": { "limits": { "cpu": "100m", @@ -4842,52 +667,39 @@ "memory": "100Mi" } }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact(\"a\")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact(\"b\")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\n with open(b.path, 'w') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n" + } + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}" + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag" + } + ] }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {} } ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-lightning-cpu-pipeline-c8drk" + "name": "pipeline-test-artifact-pipeline-jx7pb" } }, - "synchronization": null, "task_results_completion_status": { - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-1557279593": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-2520177762": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3039208291": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3550627230": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3551413979": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3659131042": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-3979811449": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-4212313871": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-888842340": true, - "pipeline-test-lightning-cpu-pipeline-c8drk-flow-5v7qq-934355678": true + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": true, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": true } } } \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_4.json b/data_models/workflows/hera/hera_workflow_4.json deleted file mode 100644 index 653c15b..0000000 --- a/data_models/workflows/hera/hera_workflow_4.json +++ /dev/null @@ -1,3948 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-", - "generation": 13, - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "namespace": "argo", - "owner_references": null, - "resource_version": "13587", - "self_link": null, - "uid": "93098e5d-b8fe-4e2a-83d8-e19b7489c980" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8" - } - }, - "status": { - "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, - "s3": { - "access_key_secret": null, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - } - }, - "config_map": "artifact-repositories", - "default": null, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo" - }, - "compressed_nodes": null, - "conditions": [ - { - "message": null, - "status": "False", - "type": "PodRunning" - }, - { - "message": null, - "status": "True", - "type": "Completed" - } - ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, - "nodes": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx": { - "boundary_id": null, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784" - ], - "daemoned": null, - "display_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "node_flag": null, - "outbound_nodes": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811" - ], - "outputs": null, - "phase": "Succeeded", - "pod_ip": null, - "progress": "7/7", - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, - "template_scope": "local/", - "type": "DAG" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-delete-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-delete-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-delete-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759" - ], - "daemoned": null, - "display_name": "show-duration-param-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "60", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-1(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-242.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-3", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-2", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811" - ], - "daemoned": null, - "display_name": "torch-ddp-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-49-47.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 15, - "memory": 1112, - "nvidia.com/gpu": 138 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-3(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-49-43.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-1", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-2(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-184.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488" - ], - "daemoned": null, - "display_name": "torch-ddp-create-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-create-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-create-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662" - ], - "daemoned": null, - "display_name": "torch-ddp-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "3/3", - "resources_duration": { - "cpu": 16, - "memory": 1135, - "nvidia.com/gpu": 138 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": null, - "daemoned": null, - "display_name": "show-duration-param-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "60", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "7/7", - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/show-duration-param": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-0": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-1": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-3": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-create-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-delete-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": null, - "value": null - } - ], - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8" - } - }, - "synchronization": null, - "task_results_completion_status": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": true, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_5.json b/data_models/workflows/hera/hera_workflow_5.json deleted file mode 100644 index b3846f0..0000000 --- a/data_models/workflows/hera/hera_workflow_5.json +++ /dev/null @@ -1,5190 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-", - "generation": 11, - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "namespace": "argo", - "owner_references": null, - "resource_version": "11623", - "self_link": null, - "uid": "c085649e-4392-4616-b1fd-2e553aebd469" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx" - } - }, - "status": { - "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, - "s3": { - "access_key_secret": null, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - } - }, - "config_map": "artifact-repositories", - "default": null, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo" - }, - "compressed_nodes": null, - "conditions": [ - { - "message": null, - "status": "False", - "type": "PodRunning" - }, - { - "message": null, - "status": "True", - "type": "Completed" - } - ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, - "nodes": { - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd": { - "boundary_id": null, - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175" - ], - "daemoned": null, - "display_name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "node_flag": null, - "outbound_nodes": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112" - ], - "outputs": null, - "phase": "Succeeded", - "pod_ip": null, - "progress": "9/9", - "resources_duration": { - "cpu": 105, - "memory": 3878 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, - "template_scope": "local/", - "type": "DAG" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561" - ], - "daemoned": null, - "display_name": "torch-ddp-create-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-create-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-create-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-4", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1282628942", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-4", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 669 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-4", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-5", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1299406561", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-5", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-5", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-2", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1316184180", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-2", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 657 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-3", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1332961799", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-3", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "show-duration-param-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "60", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.show-duration-param-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524" - ], - "daemoned": null, - "display_name": "torch-ddp-0-worker-1", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1366517037", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-1", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-3(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-3(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-3", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537" - ], - "daemoned": null, - "display_name": "torch-ddp-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-2818153322", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "3/3", - "resources_duration": { - "cpu": 19, - "memory": 689 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-2(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-2(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 657 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-2", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-1(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-1(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-1", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229", - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112" - ], - "daemoned": null, - "display_name": "torch-ddp-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 665 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-0", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-delete-torch-service", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-delete-torch-service", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 0, - "memory": 0 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-delete-torch-service", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": [ - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924" - ], - "daemoned": null, - "display_name": "show-duration-param-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3763294229", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "60", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.show-duration-param-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-duration-param", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-5(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-50-203.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-5(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 17, - "memory": 621 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-5", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341": { - "boundary_id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd", - "children": null, - "daemoned": null, - "display_name": "torch-ddp-0-worker-4(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd.torch-ddp-0-worker-4(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 18, - "memory": 669 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "torch-ddp-4", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "9/9", - "resources_duration": { - "cpu": 105, - "memory": 3878 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-4", - "on_exit": null, - "template": "torch-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-5", - "on_exit": null, - "template": "torch-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/show-duration-param": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-0": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-1": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-3": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-4": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "metrics": null, - "name": "torch-ddp-4", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-5": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "metrics": null, - "name": "torch-ddp-5", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-create-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-torch-cpu-pipeline-2n6rx/torch-ddp-delete-torch-service": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "12", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "5", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-create-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "create", - "failure_condition": null, - "flags": null, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n namespace: argo\n labels:\n app: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "torch-ddp-delete-torch-service", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": { - "action": "delete", - "failure_condition": null, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "-n", - "argo" - ], - "manifest": null, - "manifest_from": null, - "merge_strategy": null, - "set_owner_reference": null, - "success_condition": null - }, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "torch-ddp-create-torch-service", - "on_exit": null, - "template": "torch-ddp-create-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0", - "on_exit": null, - "template": "torch-ddp-0", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-1", - "on_exit": null, - "template": "torch-ddp-1", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-2", - "on_exit": null, - "template": "torch-ddp-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-3", - "on_exit": null, - "template": "torch-ddp-3", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-4", - "on_exit": null, - "template": "torch-ddp-4", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-create-torch-service", - "hooks": null, - "inline": null, - "name": "torch-ddp-0-worker-5", - "on_exit": null, - "template": "torch-ddp-5", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "torch-ddp-delete-torch-service", - "on_exit": null, - "template": "torch-ddp-delete-torch-service", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "torch-ddp-0", - "hooks": null, - "inline": null, - "name": "show-duration-param-0", - "on_exit": null, - "template": "show-duration-param", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "0" - } - }, - "metrics": null, - "name": "torch-ddp-0", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": null, - "host_port": null, - "name": "ddp", - "protocol": "TCP" - } - ], - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "1" - } - }, - "metrics": null, - "name": "torch-ddp-1", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "2" - } - }, - "metrics": null, - "name": "torch-ddp-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "3" - } - }, - "metrics": null, - "name": "torch-ddp-3", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "4" - } - }, - "metrics": null, - "name": "torch-ddp-4", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "4", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "100", - "description": null, - "enum": null, - "global_name": null, - "name": "n_iter", - "value": null, - "value_from": null - }, - { - "default": "10", - "description": null, - "enum": null, - "global_name": null, - "name": "n_seconds_sleep", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": { - "torch-job": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2", - "torch-node": "5" - } - }, - "metrics": null, - "name": "torch-ddp-5", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "duration", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "duration", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "6", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "5", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-cf224844-8416-4e44-84d7-539997d748d2.argo.svc.cluster.local", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": null - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": null - } - ], - "env_from": null, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "300Mi" - }, - "requests": { - "cpu": "100m", - "memory": "300Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-duration-param", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-torch-cpu-pipeline-2n6rx" - } - }, - "synchronization": null, - "task_results_completion_status": { - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1117923175": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1352423924": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-1396147642": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3153917983": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3155590524": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3218331537": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-3570269112": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-4186039992": true, - "pipeline-test-torch-cpu-pipeline-2n6rx-flow-vgwzd-921081341": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_6.json b/data_models/workflows/hera/hera_workflow_6.json deleted file mode 100644 index 54beab2..0000000 --- a/data_models/workflows/hera/hera_workflow_6.json +++ /dev/null @@ -1,1478 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-parameter-pipeline-mhwgd-flow-", - "generation": 7, - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "namespace": "argo", - "owner_references": null, - "resource_version": "10156", - "self_link": null, - "uid": "8ba8d28a-5dd1-4234-a5e4-364ba12ab24b" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-100", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "100", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "status": { - "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, - "s3": { - "access_key_secret": null, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - } - }, - "config_map": "artifact-repositories", - "default": null, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo" - }, - "compressed_nodes": null, - "conditions": [ - { - "message": null, - "status": "False", - "type": "PodRunning" - }, - { - "message": null, - "status": "True", - "type": "Completed" - } - ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, - "nodes": { - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq": { - "boundary_id": null, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557" - ], - "daemoned": null, - "display_name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "node_flag": null, - "outbound_nodes": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140" - ], - "outputs": null, - "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, - "template_scope": "local/", - "type": "DAG" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140" - ], - "daemoned": null, - "display_name": "a-plus-b-plus-2-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0", - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-plus-2-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "2", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b-plus-2", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-2800207309" - ], - "daemoned": null, - "display_name": "a-plus-b-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-100", - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "100", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "0", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "children": null, - "daemoned": null, - "display_name": "a-plus-b-plus-2-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "0", - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-plus-2-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "2", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 23 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b-plus-2", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908" - ], - "daemoned": null, - "display_name": "a-plus-b-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-929032557", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-100", - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "100", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq.a-plus-b-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": "0", - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "a-plus-b", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "2/2", - "resources_duration": { - "cpu": 2, - "memory": 46 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "a-plus-b", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b-plus-2": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "a-plus-b-plus-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "a-plus-b-0", - "on_exit": null, - "template": "a-plus-b", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "a-plus-b-0", - "hooks": null, - "inline": null, - "name": "a-plus-b-plus-2-0", - "on_exit": null, - "template": "a-plus-b-plus-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "-100", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "100", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "a-plus-b-0", - "on_exit": null, - "template": "a-plus-b", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": null - }, - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": "2", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": "a-plus-b-0", - "hooks": null, - "inline": null, - "name": "a-plus-b-plus-2-0", - "on_exit": null, - "template": "a-plus-b-plus-2", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "a-plus-b", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": "1", - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "2", - "description": null, - "enum": null, - "global_name": null, - "name": "b", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "a-plus-b-plus-2", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "sum", - "value": null, - "value_from": { - "config_map_key_ref": null, - "default": null, - "event": null, - "expression": null, - "jq_filter": null, - "json_path": null, - "parameter": null, - "path": "sum", - "supplied": null - } - } - ], - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-parameter-pipeline-mhwgd" - } - }, - "synchronization": null, - "task_results_completion_status": { - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-3394894908": true, - "pipeline-test-parameter-pipeline-mhwgd-flow-khxzq-764234140": true - } - } -} \ No newline at end of file diff --git a/data_models/workflows/hera/hera_workflow_7.json b/data_models/workflows/hera/hera_workflow_7.json deleted file mode 100644 index 06640f0..0000000 --- a/data_models/workflows/hera/hera_workflow_7.json +++ /dev/null @@ -1,1398 +0,0 @@ -{ - "api_version": null, - "kind": null, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "true", - "workflows.argoproj.io/pod-name-format": "v2" - }, - "cluster_name": null, - "creation_timestamp": "test-datetime-value", - "deletion_grace_period_seconds": null, - "deletion_timestamp": null, - "finalizers": null, - "generate_name": "pipeline-test-artifact-pipeline-d5rzf-flow-", - "generation": 7, - "labels": { - "workflows.argoproj.io/completed": "true", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": null, - "time": "test-datetime-value" - } - ], - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "namespace": "argo", - "owner_references": null, - "resource_version": "9912", - "self_link": null, - "uid": "9948f727-967a-4905-800e-ec80117d8398" - }, - "spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "First integration test value a", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": null, - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": null, - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": null, - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "status": { - "artifact_gc_status": { - "not_specified": true, - "pods_recouped": null, - "strategies_processed": null - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": null, - "artifactory": null, - "azure": null, - "gcs": null, - "hdfs": null, - "oss": null, - "s3": { - "access_key_secret": null, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": true, - "key_format": null, - "key_prefix": null, - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - } - }, - "config_map": "artifact-repositories", - "default": null, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo" - }, - "compressed_nodes": null, - "conditions": [ - { - "message": null, - "status": "False", - "type": "PodRunning" - }, - { - "message": null, - "status": "True", - "type": "Completed" - } - ], - "estimated_duration": null, - "finished_at": "test-datetime-value", - "message": null, - "nodes": { - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k": { - "boundary_id": null, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103" - ], - "daemoned": null, - "display_name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "inputs": null, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "node_flag": null, - "outbound_nodes": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088" - ], - "outputs": null, - "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "bettmensch-ai-dag", - "template_ref": null, - "template_scope": "local/", - "type": "DAG" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "children": null, - "daemoned": null, - "display_name": "show-artifact-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088", - "inputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null - } - ], - "parameters": null - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.show-artifact-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-artifact", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882" - ], - "daemoned": null, - "display_name": "convert-to-artifact-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2149832103", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "First integration test value a", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.convert-to-artifact-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null - } - ], - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "2/2", - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "convert-to-artifact", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737" - ], - "daemoned": null, - "display_name": "convert-to-artifact-0(0)", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882", - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "First integration test value a", - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": "null", - "value_from": null - } - ] - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.convert-to-artifact-0(0)", - "node_flag": { - "hooked": null, - "retried": true - }, - "outbound_nodes": null, - "outputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null - } - ], - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 2, - "memory": 140 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "convert-to-artifact", - "template_ref": null, - "template_scope": "local/", - "type": "Pod" - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088" - ], - "daemoned": null, - "display_name": "show-artifact-0", - "estimated_duration": null, - "finished_at": "test-datetime-value", - "host_node_name": null, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-651241737", - "inputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": { - "access_key_secret": null, - "bucket": null, - "ca_secret": null, - "create_bucket_if_not_present": null, - "encryption_options": null, - "endpoint": null, - "insecure": null, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k/pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-convert-to-artifact-2691985882/a_art.tgz", - "region": null, - "role_arn": null, - "secret_key_secret": null, - "use_sdk_creds": null - }, - "sub_path": null - } - ], - "parameters": null - }, - "memoization_status": null, - "message": null, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k.show-artifact-0", - "node_flag": null, - "outbound_nodes": null, - "outputs": { - "artifacts": null, - "exit_code": "0", - "parameters": null, - "result": null - }, - "phase": "Succeeded", - "pod_ip": null, - "progress": "1/1", - "resources_duration": { - "cpu": 1, - "memory": 24 - }, - "started_at": "test-datetime-value", - "synchronization_status": null, - "template_name": "show-artifact", - "template_ref": null, - "template_scope": "local/", - "type": "Retry" - } - }, - "offload_node_status_version": null, - "outputs": null, - "persistent_volume_claims": null, - "phase": "Succeeded", - "progress": "2/2", - "resources_duration": { - "cpu": 3, - "memory": 164 - }, - "started_at": "test-datetime-value", - "stored_templates": { - "namespaced/pipeline-test-artifact-pipeline-d5rzf/bettmensch-ai-dag": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "convert-to-artifact-0", - "on_exit": null, - "template": "convert-to-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": null, - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "convert-to-artifact-0", - "hooks": null, - "inline": null, - "name": "show-artifact-0", - "on_exit": null, - "template": "show-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/convert-to-artifact": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "convert-to-artifact", - "node_selector": null, - "outputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/show-artifact": { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-artifact", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": null, - "affinity": null, - "archive_logs": null, - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "First integration test value a", - "value_from": null - } - ] - }, - "artifact_gc": null, - "artifact_repository_ref": null, - "automount_service_account_token": null, - "dns_config": null, - "dns_policy": null, - "entrypoint": "bettmensch-ai-dag", - "executor": null, - "hooks": null, - "host_aliases": null, - "host_network": null, - "image_pull_secrets": null, - "metrics": null, - "node_selector": null, - "on_exit": null, - "parallelism": null, - "pod_disruption_budget": null, - "pod_gc": null, - "pod_metadata": null, - "pod_priority": null, - "pod_priority_class_name": null, - "pod_spec_patch": null, - "priority": null, - "retry_strategy": null, - "scheduler_name": null, - "security_context": null, - "service_account_name": "argo-workflow", - "shutdown": null, - "suspend": null, - "synchronization": null, - "template_defaults": null, - "templates": [ - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": { - "fail_fast": null, - "target": null, - "tasks": [ - { - "arguments": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": null - } - ] - }, - "continue_on": null, - "dependencies": null, - "depends": null, - "hooks": null, - "inline": null, - "name": "convert-to-artifact-0", - "on_exit": null, - "template": "convert-to-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - }, - { - "arguments": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": null, - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "parameters": null - }, - "continue_on": null, - "dependencies": null, - "depends": "convert-to-artifact-0", - "hooks": null, - "inline": null, - "name": "show-artifact-0", - "on_exit": null, - "template": "show-artifact", - "template_ref": null, - "when": null, - "with_items": null, - "with_param": null, - "with_sequence": null - } - ] - }, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "bettmensch-ai-dag", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": null, - "scheduler_name": null, - "script": null, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": null, - "parameters": [ - { - "default": null, - "description": null, - "enum": null, - "global_name": null, - "name": "a", - "value": null, - "value_from": null - }, - { - "default": "null", - "description": null, - "enum": null, - "global_name": null, - "name": "a_art", - "value": null, - "value_from": null - } - ] - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "convert-to-artifact", - "node_selector": null, - "outputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a_art", - "optional": null, - "oss": null, - "path": "a_art", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - }, - { - "active_deadline_seconds": null, - "affinity": null, - "archive_location": null, - "automount_service_account_token": null, - "container": null, - "container_set": null, - "daemon": null, - "dag": null, - "data": null, - "executor": null, - "fail_fast": null, - "host_aliases": null, - "http": null, - "init_containers": null, - "inputs": { - "artifacts": [ - { - "archive": null, - "archive_logs": null, - "artifact_gc": null, - "artifactory": null, - "azure": null, - "deleted": null, - "from_": null, - "from_expression": null, - "gcs": null, - "git": null, - "global_name": null, - "hdfs": null, - "http": null, - "mode": null, - "name": "a", - "optional": null, - "oss": null, - "path": "a", - "raw": null, - "recurse_mode": null, - "s3": null, - "sub_path": null - } - ], - "parameters": null - }, - "memoize": null, - "metadata": { - "annotations": null, - "labels": null - }, - "metrics": null, - "name": "show-artifact", - "node_selector": null, - "outputs": { - "artifacts": null, - "exit_code": null, - "parameters": null, - "result": null - }, - "parallelism": null, - "plugin": null, - "pod_spec_patch": null, - "priority": null, - "priority_class_name": null, - "resource": null, - "retry_strategy": { - "affinity": null, - "backoff": null, - "expression": null, - "limit": "1", - "retry_policy": "OnError" - }, - "scheduler_name": null, - "script": { - "args": null, - "command": [ - "python" - ], - "env": null, - "env_from": null, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": null, - "liveness_probe": null, - "name": "", - "ports": null, - "readiness_probe": null, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi" - }, - "requests": { - "cpu": "100m", - "memory": "100Mi" - } - }, - "security_context": null, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact(\"a\")\n\ndef show_artifact(a: InputArtifact) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact.\"\"\"\n with open(a.path, 'r') as a_art_file:\n a_content = a_art_file.read()\n print(f'Content of input artifact a: {a_content}')\nshow_artifact(a)", - "startup_probe": null, - "stdin": null, - "stdin_once": null, - "termination_message_path": null, - "termination_message_policy": null, - "tty": null, - "volume_devices": null, - "volume_mounts": null, - "working_dir": null - }, - "security_context": null, - "service_account_name": null, - "sidecars": null, - "steps": null, - "suspend": null, - "synchronization": null, - "timeout": null, - "tolerations": null, - "volumes": null - } - ], - "tolerations": null, - "ttl_strategy": null, - "volume_claim_gc": null, - "volume_claim_templates": null, - "volumes": null, - "workflow_metadata": null, - "workflow_template_ref": { - "cluster_scope": null, - "name": "pipeline-test-artifact-pipeline-d5rzf" - } - }, - "synchronization": null, - "task_results_completion_status": { - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-1194847088": true, - "pipeline-test-artifact-pipeline-d5rzf-flow-5z44k-2691985882": true - } - } -} \ No newline at end of file diff --git a/docker/dashboard/src/1_pipelines.py b/docker/dashboard/src/1_pipelines.py index 99205f7..d9338ab 100644 --- a/docker/dashboard/src/1_pipelines.py +++ b/docker/dashboard/src/1_pipelines.py @@ -124,6 +124,7 @@ def get_formatted_pipeline_data( "object": {}, "metadata": {}, "inputs": {}, + "outputs": {}, "dag": {}, "templates": {}, } @@ -148,6 +149,9 @@ def get_formatted_pipeline_data( formatted_pipeline_data["inputs"][resource_name] = pipeline_dict[ "inputs" ] + formatted_pipeline_data["outputs"][resource_name] = pipeline_dict[ + "outputs" + ] formatted_pipeline_data["dag"][resource_name] = pipeline_dict[ "dag" ] # noqa: E501 @@ -159,8 +163,8 @@ def get_formatted_pipeline_data( except Exception as e: st.write( f"Oops! Could not collect data for Pipeline {resource_name}: " - f"{e} Please make sure the workflow template was created with " - "the bettmensch.ai SDK and was submitted successfully." + f"{e}. Please make sure the workflow template was created with" + " the bettmensch.ai SDK and was submitted successfully." ) return formatted_pipeline_data @@ -284,7 +288,13 @@ def display_pipeline_dag_selection( task_inputs_parameters_formatted_df = pd.concat( [ task_inputs_parameters_df.drop( - ["source", "value_from"], axis=1 + [ + "source", + "type", + "argument_type", + "value_from", + ], + axis=1, ), task_inputs_parameters_df["source"].apply( pd.Series @@ -295,9 +305,10 @@ def display_pipeline_dag_selection( columns={ "name": "Name", "value": "Default", - "node": "Upstream Task", - "output_name": "Upstream Output", - "output_type": "Upstream Type", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", # noqa: E501 + "io_name": "Upstream I/O", }, inplace=False, ) @@ -312,7 +323,12 @@ def display_pipeline_dag_selection( task_inputs_artifacts_formatted_df = pd.concat( [ task_inputs_artifacts_df.drop( - ["source"], axis=1 + [ + "source", + "type", + "argument_type", + ], + axis=1, ), task_inputs_artifacts_df["source"].apply( pd.Series @@ -322,9 +338,11 @@ def display_pipeline_dag_selection( ).rename( columns={ "name": "Name", - "node": "Upstream Task", - "output_name": "Upstream Output", - "output_type": "Upstream Type", + "value": "Default", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", # noqa: E501 + "io_name": "Upstream I/O", }, inplace=False, ) @@ -349,7 +367,7 @@ def display_pipeline_dag_selection( ) task_outputs_parameters_formatted_df = ( task_outputs_parameters_df.drop( - "value_from", axis=1 + ["value_from", "type", "argument_type"], axis=1 ).rename( columns={ "name": "Name", @@ -367,7 +385,7 @@ def display_pipeline_dag_selection( ) task_outputs_artifacts_formatted_df = ( task_outputs_artifacts_df.drop( - "path", axis=1 + ["path", "type", "argument_type"], axis=1 ).rename( columns={ "name": "Name", @@ -450,9 +468,16 @@ def display_selected_pipeline( f"{selected_pipeline}`" ) - tab_inputs, tab_metadata, tab_dag, tab_templates = st.tabs( + ( + tab_inputs, + tab_outputs, + tab_metadata, + tab_dag, + tab_templates, + ) = st.tabs( [ "Pipeline Inputs", + "Pipeline Outputs", "Pipeline Meta Data", "Pipeline DAG", "Pipeline Templates", @@ -463,18 +488,106 @@ def display_selected_pipeline( with st.container(height=tab_container_height, border=False): pipeline_inputs = formatted_pipeline_data["inputs"][ selected_pipeline - ] - pipeline_inputs_formatted_df = pd.DataFrame( - pipeline_inputs - ).rename( - columns={ - "name": "Name", - "value": "Default", - } + ]["parameters"] + pipeline_inputs_formatted_df = ( + pd.DataFrame(pipeline_inputs) + .drop(["type", "argument_type"], axis=1) + .rename( + columns={ + "name": "Name", + "value": "Default", + } + ) ) st.write(":page_with_curl: Parameters") st.dataframe(pipeline_inputs_formatted_df, hide_index=True) + with tab_outputs: + with st.container(height=tab_container_height, border=False): + # build pipeline outputs parameters table + pipeline_outputs_parameters = formatted_pipeline_data[ + "outputs" + ][selected_pipeline]["parameters"] + if pipeline_outputs_parameters: + pipeline_outputs_parameters_df = pd.DataFrame( + pipeline_outputs_parameters + ) + pipeline_outputs_parameters_formatted_df = pd.concat( + [ + pipeline_outputs_parameters_df.drop( + [ + "source", + "type", + "argument_type", + "value_from", + "value", + ], + axis=1, + ), + pipeline_outputs_parameters_df["source"].apply( + pd.Series + ), + ], + axis=1, + ).rename( + columns={ + "name": "Name", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", + "io_name": "Upstream I/O", + }, + inplace=False, + ) + else: + pipeline_outputs_parameters_formatted_df = pd.DataFrame() + + # build pipeline outputs artifact table + pipeline_outputs_artifacts = formatted_pipeline_data[ + "outputs" + ][selected_pipeline]["artifacts"] + if pipeline_outputs_artifacts: + pipeline_outputs_artifacts_df = pd.DataFrame( + pipeline_outputs_artifacts + ) + pipeline_outputs_artifacts_formatted_df = pd.concat( + [ + pipeline_outputs_artifacts_df.drop( + [ + "source", + "type", + "argument_type", + ], + axis=1, + ), + pipeline_outputs_artifacts_df["source"].apply( + pd.Series + ), + ], + axis=1, + ).rename( + columns={ + "name": "Name", + "value": "Default", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", + "io_name": "Upstream I/O", + }, + inplace=False, + ) + else: + pipeline_outputs_artifacts_formatted_df = pd.DataFrame() + + st.write(":page_with_curl: Parameters") + st.dataframe( + pipeline_outputs_parameters_formatted_df, hide_index=True + ) + st.write(":open_file_folder: Artifacts") + st.dataframe( + pipeline_outputs_artifacts_formatted_df, hide_index=True + ) + with tab_metadata: with st.container(height=tab_container_height, border=False): st.markdown("### Spec") diff --git a/docker/dashboard/src/2_flows.py b/docker/dashboard/src/2_flows.py index 1028483..ceee67b 100644 --- a/docker/dashboard/src/2_flows.py +++ b/docker/dashboard/src/2_flows.py @@ -127,6 +127,7 @@ def get_formatted_flow_data( "metadata": {}, "state": {}, "inputs": {}, + "outputs": {}, "dag": {}, "templates": {}, } @@ -146,17 +147,19 @@ def get_formatted_flow_data( ] formatted_flow_data["state"][resource_name] = flow_dict["state"] formatted_flow_data["inputs"][resource_name] = flow_dict["inputs"] + formatted_flow_data["outputs"][resource_name] = flow_dict[ + "outputs" + ] formatted_flow_data["dag"][resource_name] = flow_dict["dag"] formatted_flow_data["templates"][resource_name] = flow_dict[ "templates" ] except Exception as e: st.write( - f"Oops! Could not collect data for Flow {resource_name}: {e}" - "Please make sure the Argo Workflow was created with the " + f"Oops! Could not collect data for Flow {resource_name}: {e}." + " Please make sure the Argo Workflow was created with the " "bettmensch.ai SDK and was submitted successfully." ) - raise e return formatted_flow_data @@ -282,7 +285,13 @@ def display_flow_dag_selection( task_inputs_parameters_formatted_df = pd.concat( [ task_inputs_parameters_df.drop( - ["source", "value_from"], axis=1 + [ + "source", + "type", + "argument_type", + "value_from", + ], + axis=1, ), task_inputs_parameters_df["source"].apply( pd.Series @@ -293,9 +302,10 @@ def display_flow_dag_selection( columns={ "name": "Name", "value": "Value", - "node": "Upstream Task", - "output_name": "Upstream Output", - "output_type": "Upstream Type", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", # noqa: E501 + "io_name": "Upstream I/O", }, inplace=False, ) @@ -309,20 +319,31 @@ def display_flow_dag_selection( task_inputs_artifacts_formatted_df = pd.concat( [ task_inputs_artifacts_df.drop( - ["source"], axis=1 + [ + "source", + "type", + "argument_type", + "s3", + ], + axis=1, ), task_inputs_artifacts_df["source"].apply( pd.Series ), + task_inputs_artifacts_df["s3"].apply( + pd.Series + ), ], axis=1, ).rename( columns={ "name": "Name", - "s3_prefix": "S3 Prefix", - "node": "Upstream Task", - "output_name": "Upstream Output", - "output_type": "Upstream Type", + "bucket": "S3 Bucket", + "key": "S3 Prefix", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", # noqa: E501 + "io_name": "Upstream I/O", }, inplace=False, ) @@ -347,7 +368,7 @@ def display_flow_dag_selection( ) task_outputs_parameters_formatted_df = ( task_outputs_parameters_df.drop( - "value_from", axis=1 + ["value_from", "type", "argument_type"], axis=1 ).rename( columns={ "name": "Name", @@ -363,16 +384,24 @@ def display_flow_dag_selection( task_outputs_artifacts_df = pd.DataFrame( task["outputs"]["artifacts"] ) - task_outputs_artifacts_formatted_df = ( - task_outputs_artifacts_df.drop( - "path", axis=1 - ).rename( - columns={ - "name": "Name", - "s3_prefix": "S3 Prefix", - }, - inplace=False, - ) + task_outputs_artifacts_formatted_df = pd.concat( + [ + task_outputs_artifacts_df.drop( + ["path", "type", "argument_type", "s3"], + axis=1, + ), + task_outputs_artifacts_df["s3"].apply( + pd.Series + ), + ], + axis=1, + ).rename( + columns={ + "name": "Name", + "bucket": "S3 Bucket", + "key": "S3 Prefix", + }, + inplace=False, ) else: task_outputs_artifacts_formatted_df = pd.DataFrame() @@ -476,9 +505,16 @@ def display_selected_flow( with spec_col: st.markdown(f"### :arrow_forward: Flow: `{selected_flow}`") - tab_inputs, tab_metadata, tab_dag, tab_templates = st.tabs( + ( + tab_inputs, + tab_outputs, + tab_metadata, + tab_dag, + tab_templates, + ) = st.tabs( [ "Flow Inputs", + "Flow Outputs", "Flow Meta Data", "Flow DAG", "Flow Templates", @@ -487,16 +523,106 @@ def display_selected_flow( with tab_inputs: with st.container(height=tab_container_height, border=False): - flow_inputs = formatted_flow_data["inputs"][selected_flow] - flow_inputs_formatted_df = pd.DataFrame(flow_inputs).rename( - columns={ - "name": "Name", - "value": "Value", - } + flow_inputs = formatted_flow_data["inputs"][selected_flow][ + "parameters" + ] + flow_inputs_formatted_df = ( + pd.DataFrame(flow_inputs) + .drop(["type", "argument_type"], axis=1) + .rename( + columns={ + "name": "Name", + "value": "Value", + } + ) ) st.write(":page_with_curl: Parameters") st.dataframe(flow_inputs_formatted_df, hide_index=True) + with tab_outputs: + with st.container(height=tab_container_height, border=False): + # build pipeline outputs parameters table + flow_outputs_parameters = formatted_flow_data["outputs"][ + selected_flow + ]["parameters"] + if flow_outputs_parameters: + flow_outputs_parameters_df = pd.DataFrame( + flow_outputs_parameters + ) + flow_outputs_parameters_formatted_df = pd.concat( + [ + flow_outputs_parameters_df.drop( + [ + "source", + "type", + "argument_type", + "value_from", + "value", + ], + axis=1, + ), + flow_outputs_parameters_df["source"].apply( + pd.Series + ), + ], + axis=1, + ).rename( + columns={ + "name": "Name", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", + "io_name": "Upstream I/O", + }, + inplace=False, + ) + else: + flow_outputs_parameters_formatted_df = pd.DataFrame() + + # build pipeline outputs artifact table + flow_outputs_artifacts = formatted_flow_data["outputs"][ + selected_flow + ]["artifacts"] + if flow_outputs_artifacts: + flow_outputs_artifacts_df = pd.DataFrame( + flow_outputs_artifacts + ) + flow_outputs_artifacts_formatted_df = pd.concat( + [ + flow_outputs_artifacts_df.drop( + ["source", "type", "argument_type", "s3"], + axis=1, + ), + flow_outputs_artifacts_df["source"].apply( + pd.Series + ), + flow_outputs_artifacts_df["s3"].apply(pd.Series), + ], + axis=1, + ).rename( + columns={ + "name": "Name", + "bucket": "S3 Bucket", + "key": "S3 Prefix", + "node_name": "Upstream Task", + "io_type": "Upstream I/O type", + "io_argument_type": "Upstream I/O Argument Type", + "io_name": "Upstream I/O", + }, + inplace=False, + ) + else: + flow_outputs_artifacts_formatted_df = pd.DataFrame() + + st.write(":page_with_curl: Parameters") + st.dataframe( + flow_outputs_parameters_formatted_df, hide_index=True + ) + st.write(":open_file_folder: Artifacts") + st.dataframe( + flow_outputs_artifacts_formatted_df, hide_index=True + ) + with tab_metadata: with st.container(height=tab_container_height, border=False): st.markdown("### Spec") diff --git a/sdk/bettmensch_ai/server/flow.py b/sdk/bettmensch_ai/server/flow.py index c2c9857..1a100ed 100644 --- a/sdk/bettmensch_ai/server/flow.py +++ b/sdk/bettmensch_ai/server/flow.py @@ -1,18 +1,27 @@ from __future__ import annotations from datetime import datetime -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Tuple, Type, Union from bettmensch_ai.server.pipeline import ( - NodeInput, - NodeOutput, + NodeArtifactInput, + NodeArtifactOutput, + NodeInputs, + NodeOutputs, + NodeParameterInput, + NodeParameterOutput, Pipeline, - PipelineInputParameter, + PipelineInputs, + PipelineNode, + PipelineOutputs, + PipelineParameterInput, ResourceTemplate, ScriptTemplate, ) -from bettmensch_ai.server.utils import copy_non_null_dict +from hera.workflows.models import NodeStatus as NodeStatusModel from hera.workflows.models import Workflow as WorkflowModel +from hera.workflows.models import WorkflowSpec as WorkflowSpecModel +from hera.workflows.models import WorkflowStatus as WorkflowStatusModel from pydantic import BaseModel @@ -37,20 +46,14 @@ class FlowState(BaseModel): task_results_completion_status: Optional[Dict[str, bool]] = None -# --- FlowInput -class FlowInputParameter(PipelineInputParameter): - pass - - # --- FlowNode # inputs -class FlowNodeParameterInput(NodeInput): - value: Optional[str] = None - value_from: Optional[Union[str, Dict]] = None +class FlowNodeParameterInput(NodeParameterInput): + pass -class FlowNodeArtifactInput(NodeInput): - s3_prefix: Optional[str] = None +class FlowNodeArtifactInput(NodeArtifactInput): + s3: Optional[ArtifactS3] = None class FlowNodeInputs(BaseModel): @@ -59,14 +62,17 @@ class FlowNodeInputs(BaseModel): # outputs -class FlowNodeParameterOutput(NodeOutput): +class FlowNodeParameterOutput(NodeParameterOutput): value: Optional[str] = None - value_from: Optional[Union[str, Dict]] = None -class FlowNodeArtifactOutput(NodeOutput): - path: str - s3_prefix: Optional[str] = None +class ArtifactS3(BaseModel): + bucket: Optional[str] = None + key: Optional[str] = None + + +class FlowNodeArtifactOutput(NodeArtifactOutput): + s3: Optional[ArtifactS3] = None class FlowNodeOutputs(BaseModel): @@ -98,6 +104,24 @@ class FlowNode(BaseModel): dependants: Optional[Union[str, List[str]]] = None host_node_name: Optional[str] = None + @property + def ios( + self, + ) -> List[ + Union[ + FlowNodeParameterInput, + FlowNodeParameterOutput, + FlowNodeArtifactInput, + FlowNodeArtifactOutput, + ] + ]: + return ( + self.inputs.parameters + + self.outputs.parameters + + self.inputs.artifacts + + self.outputs.artifacts + ) + # --- FlowArtifactConfiguration class FlowArtifactConfiguration(BaseModel): @@ -106,6 +130,29 @@ class FlowArtifactConfiguration(BaseModel): # --- Flow +# inputs +class FlowParameterInput(PipelineParameterInput): + pass + + +class FlowInputs(BaseModel): + parameters: List[FlowParameterInput] = [] + + +# outputs +class FlowParameterOutput(FlowNodeParameterInput): + type: str = "outputs" + + +class FlowArtifactOutput(FlowNodeArtifactInput): + type: str = "outputs" + + +class FlowOutputs(BaseModel): + parameters: List[FlowParameterOutput] = [] + artifacts: List[FlowArtifactOutput] = [] + + class Flow(Pipeline): """A flow node is an instantiated pipeline node on Kubernetes.""" @@ -113,125 +160,320 @@ class Flow(Pipeline): state: FlowState artifact_configuration: FlowArtifactConfiguration templates: List[Union[ScriptTemplate, ResourceTemplate]] - inputs: List[FlowInputParameter] = [] + inputs: Optional[FlowInputs] = None + outputs: Optional[FlowOutputs] = None dag: List[FlowNode] @classmethod - def build_dag(cls, workflow_status: Dict) -> List[FlowNode]: + def get_node_status_by_display_name( + cls, workflow_status: WorkflowStatusModel + ) -> Dict[str, NodeStatusModel]: + """Generates a display_name -> NodeStatus dictionary from the passed + workflow_status' `nodes` attribute (holds a node_id -> NodeStatusModel + dictionary). + + Args: + workflow_status (WorkflowStatusModel): The workflow status + instance. + + Returns: + Dict[str,NodeStatusModel]: The display_name -> NodeStatusModel + dictionary + """ + + return dict( + [ + (node_status.display_name, node_status) + for node_status in workflow_status.nodes.values() + ] + ) + + @classmethod + def _get_model_classes( + cls, + pipeline_io: Union[ + Type[PipelineInputs], + Type[PipelineOutputs], + Type[NodeInputs], + Type[NodeOutputs], + ], + ) -> Tuple[ + Literal["inputs", "outputs"], + Union[ + Type[FlowParameterInput], + Type[FlowParameterOutput], + Type[FlowNodeParameterInput], + Type[FlowNodeParameterOutput], + ], + Union[ + Type[FlowArtifactOutput], + Type[FlowNodeArtifactInput], + Type[FlowNodeArtifactOutput], + ], + Union[ + Type[FlowInputs], + Type[FlowOutputs], + Type[FlowNodeInputs], + Type[FlowNodeOutputs], + ], + ]: + """_summary_ + + Args: + pipeline_io (Union[ + type[PipelineInputs], + type[PipelineOutputs], + type[NodeInputs], + type[NodeOutputs] + ]): _description_ + + Returns: + Tuple[ + Union[ + type[FlowParameterInput], + type[FlowParameterOutput], + type[FlowNodeParameterInput], + type[FlowNodeParameterOutput] + ], + Union[ + type[FlowArtifactOutput], + type[FlowNodeArtifactInput], + type[FlowNodeArtifactOutput] + ], + Union[ + type[FlowInputs], + type[FlowOutputs], + type[FlowNodeInputs], + type[FlowNodeOutputs] + ] + ]: _description_ + """ + + if pipeline_io == PipelineInputs: + return "inputs", FlowParameterInput, None, FlowInputs + elif pipeline_io == PipelineOutputs: + return ( + "outputs", + FlowParameterOutput, + FlowArtifactOutput, + FlowOutputs, + ) + elif pipeline_io == NodeInputs: + return ( + "inputs", + FlowNodeParameterInput, + FlowNodeArtifactInput, + FlowNodeInputs, + ) + elif pipeline_io == NodeOutputs: + return ( + "outputs", + FlowNodeParameterOutput, + FlowNodeArtifactOutput, + FlowNodeOutputs, + ) + else: + raise TypeError( + f"Type {pipeline_io} not supported. Must be one of" + "- PipelineInputs" + "- PipelineOutputs" + "- NodeInputs" + "- NodeOutputs" + ) + + @classmethod + def _build_generic_io( + cls, + pipeline_io: Union[ + PipelineInputs, + PipelineOutputs, + NodeInputs, + NodeOutputs, + ], + flow_node: NodeStatusModel, + ) -> Union[FlowInputs, FlowOutputs, FlowNodeInputs, FlowNodeOutputs]: + """Parametrizable utility function to build either inputs or outputs + for the Flow or one of its FlowNodes using Pipeline or Node I/Os. + + Args: + pipeline_io (Union[ + PipelineInputs, + PipelineOutputs, + NodeInputs, + NodeOutputs, + ]): _description_ + flow_node (NodeStatusModel): A pipeline or pipeline node I/O + instance + + Returns: + Union[FlowInputs, FlowOutputs, FlowNodeInputs, FlowNodeOutputs]: + A flow or flow node I/O instance + """ + + io = {"parameters": [], "artifacts": []} + + ( + io_type, + parameter_io_class, + artifact_io_class, + io_class, + ) = cls._get_model_classes(pipeline_io.__class__) + + # parameters + for ppo in pipeline_io.parameters: + flow_parameter_io_data = ppo.model_dump() + try: + fpo_value = [ + fpo + for fpo in getattr(flow_node, io_type).parameters + if fpo.name == ppo.name + ][0].value + except (AttributeError, IndexError): + fpo_value = None + finally: + flow_parameter_io_data["value"] = fpo_value + flow_parameter_output = parameter_io_class.model_validate( + flow_parameter_io_data + ) + io["parameters"].append(flow_parameter_output) + + # artifacts + if artifact_io_class is not None: + for pao in pipeline_io.artifacts: + flow_artifact_io_data = pao.model_dump() + try: + fao_s3 = [ + fao + for fao in getattr(flow_node, io_type).artifacts + if fao.name == pao.name + ][0].s3.dict() + except (AttributeError, IndexError): + fao_s3 = None + finally: + flow_artifact_io_data["s3"] = fao_s3 + flow_parameter_output = artifact_io_class.model_validate( + flow_artifact_io_data + ) + io["artifacts"].append(flow_parameter_output) + else: + del io["artifacts"] + + return io_class.model_validate(io) + + @classmethod + def build_io( + cls, + workflow_template_spec: WorkflowSpecModel, + workflow_nodes_dict: Dict[str, NodeStatusModel], + ) -> Union[FlowInputs, FlowOutputs]: + """Build the io attributes of a Flow instance. + + Args: + inner_dag_node (NodeStatusModel): The NodeStatus of the Flow's + Workflow's inner dag. + + + Returns: + Union[FlowInputs,FlowOutputs]: The data for the Flow instance's + `inputs` and `outputs` attributes + """ + + inner_dag_node = workflow_nodes_dict["bettmensch-ai-inner-dag"] + pipeline_inputs, pipeline_outputs = super().build_io( + workflow_template_spec + ) + + # --- inputs + flow_inputs = cls._build_generic_io(pipeline_inputs, inner_dag_node) + + # --- outputs + flow_outputs = cls._build_generic_io(pipeline_outputs, inner_dag_node) + + return flow_inputs, flow_outputs + + @classmethod + def build_flow_node( + cls, + pipeline_node: PipelineNode, + workflow_nodes_dict: Dict[str, NodeStatusModel], + ) -> FlowNode: + """Builds a FlowNode + + Args: + pipeline_node (PipelineNode): _description_ + workflow_nodes_dict (Dict[str,NodeStatusModel]): _description_ + + Returns: + FlowNode: _description_ + """ + + flow_node_data = { + "name": pipeline_node.name, + "template": pipeline_node.template, + "depends": pipeline_node.depends, + } + + flow_node = workflow_nodes_dict.get(pipeline_node.name, None) + + if flow_node is None: + flow_node_data["pod_name"] = pipeline_node.name + flow_node_data["phase"] = "Not Scheduled" + flow_node_inputs, flow_node_outputs = ( + pipeline_node.inputs.model_dump(), + pipeline_node.outputs.model_dump(), + ) + else: + flow_node_data["id"] = flow_node.id + flow_node_data["type"] = flow_node.type + flow_node_data["pod_name"] = flow_node.name + flow_node_data["phase"] = flow_node.phase + flow_node_data["dependants"] = getattr(flow_node, "children", None) + flow_node_data["host_node_name"] = getattr( + flow_node, "host_node_name", None + ) + flow_node_inputs = cls._build_generic_io( + pipeline_node.inputs, flow_node + ) + flow_node_outputs = cls._build_generic_io( + pipeline_node.outputs, flow_node + ) + + flow_node_data["inputs"] = flow_node_inputs + flow_node_data["outputs"] = flow_node_outputs + + flow_node = FlowNode.model_validate(flow_node_data) + + return flow_node + + @classmethod + def build_dag( + cls, + workflow_status: WorkflowStatusModel, + workflow_nodes_dict: Dict[str, NodeStatusModel], + ) -> List[FlowNode]: """Utility to build the Flow class' dag attribute. Identical to the Pipeline class' dag attribute, but with additional values resolved at runtime. Args: - workflow_status (_type_): The status field of a dict-ified - IoArgoprojWorkflowV1alpha1Workflow class instance + workflow_status (WorkflowStatusModel): The status field of a hera + Workflow model instance. Returns: List[FlowNode]: The constructed dag attribute of a Flow instance. """ # build pipeline dag - workflow_template_spec = workflow_status[ - "stored_workflow_template_spec" - ] - pipeline_dag = super().build_dag(workflow_template_spec) + pipeline_dag = super().build_dag( + workflow_status.stored_workflow_template_spec + ) # add FlowNode specific values and available resolved input/output # values for each FlowNode flow_dag = [] - workflow_nodes = list(workflow_status["nodes"].values()) - - print( - f"Workflow node display names:{[wn['display_name'] for wn in workflow_nodes]}" # noqa: E501 - ) for pipeline_node in pipeline_dag: - pipeline_node_dict = pipeline_node.model_dump() - print(f"Pipeline node name: {pipeline_node_dict['name']}") - - flow_node_dict = { - "name": pipeline_node_dict["name"], - "template": pipeline_node_dict["template"], - "inputs": pipeline_node_dict["inputs"], - "depends": pipeline_node_dict["depends"], - } - - potential_workflow_node_dict = [ - workflow_node - for workflow_node in workflow_nodes - if workflow_node["display_name"] == pipeline_node_dict["name"] - ] - - if potential_workflow_node_dict: - workflow_node_dict = copy_non_null_dict( - potential_workflow_node_dict[0] - ) - - flow_node_dict["id"] = workflow_node_dict["id"] - flow_node_dict["type"] = workflow_node_dict["type"] - flow_node_dict["pod_name"] = workflow_node_dict["name"] - flow_node_dict["phase"] = workflow_node_dict["phase"] - flow_node_dict["outputs"] = dict( - **pipeline_node_dict["outputs"], - **{"exit_code": workflow_node_dict.get("exit_code")}, - ) - flow_node_dict["logs"] = None - flow_node_dict["dependants"] = workflow_node_dict.get( - "children" - ) - flow_node_dict["host_node_name"] = workflow_node_dict.get( - "host_node_name" - ) - - # inject resolved input values where possible - for argument_io in ("inputs", "outputs"): - for argument_type in ("parameters", "artifacts"): - try: - workflow_node_arguments = workflow_node_dict[ - argument_io - ][argument_type] - flow_node_arguments = flow_node_dict[argument_io][ - argument_type - ] - - if workflow_node_arguments is None: - continue - else: - for i, argument in enumerate( - workflow_node_arguments - ): - if i < len(flow_node_arguments): - if ( - flow_node_arguments[i]["name"] - == argument["name"] - ): - if argument_type == "parameters": - flow_node_arguments[i][ - "value" - ] = argument["value"] - elif argument_type == "artifacts": - flow_node_arguments[i][ - "s3_prefix" - ] = argument["s3"]["key"] - elif argument["name"] == "main-logs": - flow_node_dict["logs"] = argument - else: - pass - except KeyError: - pass - - else: - flow_node_dict["pod_name"] = pipeline_node_dict["name"] - flow_node_dict["phase"] = "Not Scheduled" - flow_node_dict["outputs"] = dict( - **pipeline_node_dict["outputs"], - ) - flow_node_dict["logs"] = None - try: - flow_node = FlowNode(**flow_node_dict) - except Exception as e: - raise (e) - + flow_node = cls.build_flow_node(pipeline_node, workflow_nodes_dict) flow_dag.append(flow_node) return flow_dag @@ -252,46 +494,45 @@ def from_hera_workflow_model(cls, workflow_model: WorkflowModel) -> Flow: Returns: Flow: A Flow class instance. """ - - workflow_dict = workflow_model.dict() - workflow_spec = workflow_dict["spec"].copy() - workflow_status = workflow_dict["status"].copy() - workflow_template_spec = workflow_status[ - "stored_workflow_template_spec" - ].copy() + workflow_status = workflow_model.status # metadata - metadata = FlowMetadata(**workflow_dict["metadata"]) + metadata = FlowMetadata(**workflow_model.metadata.dict()) # state - state = FlowState(**workflow_status) + state = FlowState(**workflow_status.dict()) # artifact_configuration artifact_configuration = FlowArtifactConfiguration( - repository_ref=workflow_status["artifact_gc_status"], - gc_status=workflow_status["artifact_repository_ref"], + repository_ref=workflow_status.artifact_gc_status.dict(), + gc_status=workflow_status.artifact_repository_ref.dict(), ) # templates - entrypoint_template = workflow_template_spec["entrypoint"] templates = [] - for template in workflow_template_spec["templates"]: - # we are not interested in the entrypoint template - if template["name"] == entrypoint_template: - continue - elif template["script"] is not None: - templates.append(ScriptTemplate.model_validate(template)) - elif template["resource"] is not None: - templates.append(ResourceTemplate.model_validate(template)) - - # inputs - inputs = [ - FlowInputParameter(**parameter) - for parameter in workflow_spec["arguments"]["parameters"] - ] + for ( + template + ) in workflow_status.stored_workflow_template_spec.templates: + # we are only interested in Script and Resource type templates + if template.script is not None: + templates.append( + ScriptTemplate.model_validate(template.dict()) + ) + elif template.resource is not None: + templates.append( + ResourceTemplate.model_validate(template.dict()) + ) + + # io + workflow_nodes_dict = cls.get_node_status_by_display_name( + workflow_status + ) + inputs, outputs = cls.build_io( + workflow_status.stored_workflow_template_spec, workflow_nodes_dict + ) # dag - dag = cls.build_dag(workflow_status) + dag = cls.build_dag(workflow_status, workflow_nodes_dict) return cls( metadata=metadata, @@ -299,5 +540,6 @@ def from_hera_workflow_model(cls, workflow_model: WorkflowModel) -> Flow: artifact_configuration=artifact_configuration, templates=templates, inputs=inputs, + outputs=outputs, dag=dag, ) diff --git a/sdk/bettmensch_ai/server/pipeline.py b/sdk/bettmensch_ai/server/pipeline.py index 36811b6..9e2fbd9 100644 --- a/sdk/bettmensch_ai/server/pipeline.py +++ b/sdk/bettmensch_ai/server/pipeline.py @@ -11,6 +11,13 @@ DagVisualizationItems, ) from bettmensch_ai.server.utils import copy_non_null_dict +from hera.workflows.models import Artifact as ArtifactModel +from hera.workflows.models import DAGTask as DAGTaskModel +from hera.workflows.models import Metadata as PodMetadataModel +from hera.workflows.models import Parameter as ParameterModel +from hera.workflows.models import Template as TemplateModel +from hera.workflows.models import WorkflowMetadata as WorkflowMetadataModel +from hera.workflows.models import WorkflowSpec as WorkflowSpecModel from hera.workflows.models import WorkflowTemplate as WorkflowTemplateModel from pydantic import BaseModel @@ -20,11 +27,14 @@ "task": "⤵️", # :arrow_heading_down: "pipeline": "⏬", # :arrow_double_down: }, - "outputs": {"task": "↪️"}, # :arrow_right_hook: + "outputs": {"task": "↪️", "pipeline": "↪️"}, # :arrow_right_hook: "parameters": "📃", # :page_with_curl "artifacts": "📂", # :open_file_folder: } +INNER_PIPELINE_DAG = "bettmensch-ai-inner-dag" +OUTER_PIPELINE_DAG = "bettmensch-ai-outer-dag" + # --- PipelineMetadata class WorkflowTemplateMetadata(BaseModel): @@ -37,8 +47,8 @@ class WorkflowTemplateMetadata(BaseModel): class PipelineMetadata(BaseModel): pipeline: WorkflowTemplateMetadata - flow: Optional[Dict] = None - component: Optional[Dict] = None + flow: Optional[WorkflowMetadataModel] = None + component: Optional[PodMetadataModel] = None # --- ScriptTemplate @@ -108,33 +118,29 @@ class ResourceTemplate(BaseModel): resource: Resource -# --- PipelineInput -class PipelineInputParameter(BaseModel): - name: str - value: Optional[str] = None - default: Optional[Any] = None - - # --- PipelineNode # inputs class NodeInputSource(BaseModel): - node: str - output_name: str - output_type: Literal["parameters", "artifacts"] + node_name: str + io_type: Literal["inputs", "outputs"] + io_argument_type: Literal["parameters", "artifacts"] + io_name: str class NodeInput(BaseModel): name: str source: Optional[NodeInputSource] = None + type: str = "inputs" class NodeParameterInput(NodeInput): value: Optional[str] = None value_from: Optional[Union[str, Dict]] = None + argument_type: str = "parameters" class NodeArtifactInput(NodeInput): - pass + argument_type: str = "artifacts" class NodeInputs(BaseModel): @@ -145,14 +151,17 @@ class NodeInputs(BaseModel): # outputs class NodeOutput(BaseModel): name: str + type: str = "outputs" class NodeParameterOutput(NodeOutput): value_from: Optional[Union[str, Dict]] = None + argument_type: str = "parameters" class NodeArtifactOutput(NodeOutput): path: str + argument_type: str = "artifacts" class NodeOutputs(BaseModel): @@ -169,14 +178,69 @@ class PipelineNode(BaseModel): outputs: NodeOutputs depends: List[str] = [] + @property + def ios( + self, + ) -> List[ + Union[ + NodeParameterInput, + NodeParameterOutput, + NodeArtifactInput, + NodeArtifactOutput, + ] + ]: + return ( + self.inputs.parameters + + self.outputs.parameters + + self.inputs.artifacts + + self.outputs.artifacts + ) + # --- Pipeline +# inputs +class PipelineParameterInput(BaseModel): + name: str + value: Optional[str] = None + type: str = "inputs" + argument_type: str = "parameters" + + +class PipelineInputs(BaseModel): + parameters: List[PipelineParameterInput] = [] + + +# outputs +class PipelineParameterOutput(NodeParameterInput): + type: str = "outputs" + + +class PipelineArtifactOutput(NodeArtifactInput): + type: str = "outputs" + + +class PipelineOutputs(BaseModel): + parameters: List[PipelineParameterOutput] = [] + artifacts: List[PipelineArtifactOutput] = [] + + class Pipeline(BaseModel): metadata: PipelineMetadata templates: List[Union[ScriptTemplate, ResourceTemplate]] - inputs: List[PipelineInputParameter] = [] + inputs: Optional[PipelineInputs] = None + outputs: Optional[PipelineOutputs] = None dag: List[PipelineNode] + @property + def ios( + self, + ) -> List[Union[PipelineInputs, PipelineOutputs, PipelineArtifactOutput]]: + return ( + self.inputs.parameters + + self.outputs.parameters + + self.outputs.artifacts + ) + def get_template(self, name: str) -> ScriptTemplate: return [ @@ -188,7 +252,11 @@ def get_dag_task(self, name: str) -> PipelineNode: return [task for task in self.dag if task.name == name][0] @staticmethod - def resolve_value_expression(expression: str) -> Tuple[str, str, str]: + def contains_parameter_reference(value: str) -> bool: + return "{{" in value + + @staticmethod + def resolve_parameter_reference(reference: str) -> Tuple[str, str, str]: """Utility to resolve a node argument's value expression to the node and output references. @@ -208,129 +276,365 @@ def resolve_value_expression(expression: str) -> Tuple[str, str, str]: # '{{workflow.parameters.coin}}' -> 'workflow.parameters.coin' # '{{tasks.Set-a-coin.outputs.parameters.coin}}' -> # 'tasks.Set-a-coin.outputs.parameters.coin' - expression_content = expression.replace("{{", "").replace("}}", "") + expression_content = reference.replace("{{", "").replace("}}", "") - # 'workflow.parameters.coin' -> ['workflow','parameters','coin'] + # 'inputs.parameters.coin' -> ['inputs','parameters','coin'] # 'tasks.Set-a-coin.outputs.parameters.coin' -> # ['tasks','Set-a-coin','outputs','parameters','coin'] tokens = expression_content.split(".") - if tokens[0] == "workflow": - # ('pipeline','parameters','coin') - return ("pipeline", tokens[1], tokens[2]) + if tokens[0] == "inputs": + # ('pipeline','inputs','parameters','coin') + return ("pipeline", tokens[0], tokens[1], tokens[2]) elif tokens[0] == "tasks": - # ('Set-a-coin','parameters','coin') - return (tokens[1], tokens[3], tokens[4]) + # ('Set-a-coin','outputs','parameters','coin') + return (tokens[1], tokens[2], tokens[3], tokens[4]) else: raise ValueError(f"First token {tokens[0]} not supported.") @classmethod - def build_dag(cls, workflow_template_spec: Dict) -> List[PipelineNode]: - """Utility to build the Pipeline class' dag attribute. + def get_relevant_templates( + cls, workflow_template_spec: WorkflowSpecModel + ) -> Tuple[TemplateModel, Dict[str, TemplateModel]]: + """Converts a workflow template spec instance into a tuple with the + inner dag template in the first entry, and a template.name -> template + dictionary in the second entry. Args: - workflow_template_spec (Dict): The spec field of a dict-ified - hera.workflows.WorkflowTemplate class instance + workflow_template_spec (WorkflowSpecModel): The workflow template + spec. Returns: - List[PipelineNode]: The constructed dag attribute of a Pipeline - instance. + Tuple[TemplateModel, Dict[str,TemplateModel]]: The inner dag + template and a dict with the remaining templates (excluding the + outer dag template). """ - dag = [] - templates_dict = dict( + + component_templates_dict = dict( [ - (template["name"], template) - for template in workflow_template_spec["templates"] + (template.name, template) + for template in workflow_template_spec.templates ] ) - entrypoint_template = templates_dict.pop( - workflow_template_spec["entrypoint"] + + # remove outer dag template + inner_dag_template = component_templates_dict.pop(INNER_PIPELINE_DAG) + + return inner_dag_template, component_templates_dict + + @classmethod + def build_pipeline_output_parameter( + cls, output_parameter: ParameterModel + ) -> PipelineParameterOutput: + """Builds and returns a PipelineParameterOutput instance from the + provided ParameterModel instance. Checks for source parameter + references and resolves them into a NodeInputSource attribute on the + return. + + Raises a ValueError if no such reference can be found. + + Args: + output_parameter (ParameterModel): The output parameter + Returns: + PipelineParameterOutput: The assembled pipeline parameter + output. + """ + + try: + output_parameter_value_from_parameter = ( + output_parameter.value_from.parameter + ) + assert cls.contains_parameter_reference( + output_parameter_value_from_parameter + ) + except (AttributeError, AssertionError): + raise ValueError( + "Pipeline parameter output must provide a" + " source via its `value_from.parameter` attribute," + f" but seems to be missing: {output_parameter}" + ) + ( + upstream_node, + io_type, + io_argument_type, + io_name, + ) = cls.resolve_parameter_reference( + output_parameter_value_from_parameter + ) + return PipelineParameterOutput( + name=output_parameter.name, + source=NodeInputSource( + node_name=upstream_node, + io_type=io_type, + io_argument_type=io_argument_type, + io_name=io_name, + ), + ) + + @classmethod + def build_pipeline_output_artifact( + cls, output_artifact: ArtifactModel + ) -> PipelineArtifactOutput: + """Builds and returns a PipelineArtifactOutput instance from the + provided ArtifactModel instance. Checks for source artifact references + and resolves them into a NodeInputSource attribute on the return. + + Raises a ValueError if no such reference can be found. + + Args: + output_artifact (ArtifactModel): The output artifact + + Returns: + PipelineArtifactOutput: The assembled pipeline artifact output. + """ + + output_artifact_from_ = getattr(output_artifact, "from_", None) + + try: + output_artifact_from_ = output_artifact.from_ + assert cls.contains_parameter_reference(output_artifact_from_) + except (AttributeError, AssertionError): + raise ValueError( + "Pipeline artifact input must provide a" + " source via its `from_` attribute, but seems to" + f" be missing: {output_artifact_from_}" + ) + + ( + upstream_node, + io_type, + io_argument_type, + io_name, + ) = cls.resolve_parameter_reference(output_artifact_from_) + return PipelineArtifactOutput( + name=output_artifact.name, + source=NodeInputSource( + node_name=upstream_node, + io_type=io_type, + io_argument_type=io_argument_type, + io_name=io_name, + ), + ) + + @classmethod + def build_io( + cls, workflow_template_spec: WorkflowSpecModel + ) -> Tuple[PipelineInputs, PipelineOutputs]: + + inner_dag_template, _ = cls.get_relevant_templates( + workflow_template_spec + ) + + # add pipeline inputs directly from inner dag template + if inner_dag_template.inputs is not None: + if inner_dag_template.inputs.parameters is not None: + pipeline_inputs = PipelineInputs( + parameters=[ + PipelineParameterInput.model_validate( + dag_input_parameter.dict() + ) + for dag_input_parameter in inner_dag_template.inputs.parameters # noqa: E501 + ] + ) + else: + pipeline_inputs = PipelineInputs(parameters=[], artifacts=[]) + + pipeline_outputs = {"parameters": [], "artifacts": []} + + if inner_dag_template.outputs is not None: + # output parameters + output_parameters = inner_dag_template.outputs.parameters + if output_parameters is not None: + for output_parameter in output_parameters: + pipeline_output_parameter = ( + cls.build_pipeline_output_parameter(output_parameter) + ) + pipeline_outputs["parameters"].append( + pipeline_output_parameter + ) + + # output artifacts + output_artifacts = inner_dag_template.outputs.artifacts + if output_artifacts is not None: + for output_artifact in output_artifacts: + pipeline_output_artifact = ( + cls.build_pipeline_output_artifact(output_artifact) + ) + pipeline_outputs["artifacts"].append( + pipeline_output_artifact + ) + + pipeline_outputs = PipelineOutputs.model_validate(pipeline_outputs) + + return pipeline_inputs, pipeline_outputs + + @classmethod + def build_pipeline_node_input_parameter( + cls, input_parameter: ParameterModel + ) -> NodeParameterInput: + """Builds and returns a NodeParameterInput instance from the provided + ParameterModel instance. Checks for source parameter references and + resolves them into a NodeInputSource attribute on the return when + needed. + + Args: + input_parameter (ParameterModel): The input parameter + + Returns: + NodeParameterInput: The assembled pipeline node parameter input. + """ + + input_parameter_value = getattr(input_parameter, "value", None) + + if not cls.contains_parameter_reference(input_parameter_value): + return NodeParameterInput.model_validate(input_parameter.dict()) + else: + ( + upstream_node, + io_type, + io_argument_type, + io_name, + ) = cls.resolve_parameter_reference(input_parameter_value) + return NodeParameterInput( + name=input_parameter.name, + source=NodeInputSource( + node_name=upstream_node, + io_type=io_type, + io_argument_type=io_argument_type, + io_name=io_name, + ), + ) + + @classmethod + def build_pipeline_node_input_artifact( + cls, input_artifact: ArtifactModel + ) -> NodeArtifactInput: + """Builds and returns a NodeArtifactInput instance from the provided + ArtifactModel instance. Checks for source artifact references and + resolves them into a NodeInputSource attribute on the return. + + Raises a ValueErro if no such reference could be found. + + Args: + input_artifact (ArtifactModel): The input artifact + + Returns: + NodeArtifactInput: The assembled pipeline node artifact input. + """ + + try: + input_artifact_from_ = input_artifact.from_ + assert cls.contains_parameter_reference(input_artifact_from_) + except (AttributeError, AssertionError): + raise ValueError( + "Pipeline node artifact input must provide a" + " source via its `from_` attribute, but seems to" + f" be missing: {input_artifact_from_}" + ) + + ( + upstream_node, + io_type, + io_argument_type, + io_name, + ) = cls.resolve_parameter_reference(input_artifact_from_) + return NodeArtifactInput( + name=input_artifact.name, + source=NodeInputSource( + node_name=upstream_node, + io_type=io_type, + io_argument_type=io_argument_type, + io_name=io_name, + ), ) - tasks = entrypoint_template["dag"]["tasks"] - - for task in tasks: - # assemble task data structure: name, template and depends can be - # copied straight from the task entry of the dag template - pipeline_node = { - "name": task["name"], - "template": task["template"], - "depends": task["depends"].split(" && ") - if task["depends"] is not None - else [], - } - # the outputs can be obtained from the reference template's outputs + + @classmethod + def build_pipeline_node( + cls, + task: DAGTaskModel, + component_templates_dict: Dict[str, TemplateModel], + ) -> PipelineNode: + + # initialize pipeline node dict container with basic attributes + pipeline_node = { + "name": task.name, + "template": task.template, + "depends": task.depends.split(" && ") + if task.depends is not None + else [], + } + + # add node outputs directly from template + node_outputs = component_templates_dict[task.template].outputs + if node_outputs is not None: pipeline_node["outputs"] = NodeOutputs( **copy_non_null_dict( - templates_dict[pipeline_node["template"]]["outputs"] + component_templates_dict[task.template].outputs.dict() ) ) + else: + pipeline_node["outputs"] = NodeOutputs(parameters=[], artifacts=[]) - # the inputs need to resolve the expressions to either the pipeline - # or reference task in the expression if no expression is used, the - # argument spec can be directly appended to the corresponding - # parameters/artifacts list - pipeline_node_inputs = {"parameters": [], "artifacts": []} - # try: - # node_input_parameters = task["arguments"]["parameters"] - # except KeyError: - # node_input_parameters = [] - if task["arguments"]["parameters"] is not None: - node_input_parameters = task["arguments"]["parameters"] - else: - node_input_parameters = [] - - # try: - # node_input_artifacts = task["arguments"]["artifacts"] - # except KeyError: - # node_input_artifacts = [] - if task["arguments"]["artifacts"] is not None: - node_input_artifacts = task["arguments"]["artifacts"] - else: - node_input_artifacts = [] - - # build parameter inputs - for node_argument in node_input_parameters: - if "{{" not in node_argument["value"]: - pipeline_node_inputs["parameters"].append( - NodeParameterInput(**node_argument) - ) - elif node_argument.get("value") is not None: - ( - upstream_node, - output_type, - output_name, - ) = cls.resolve_value_expression(node_argument["value"]) - pipeline_node_inputs["parameters"].append( - NodeParameterInput( - name=node_argument["name"], - source=NodeInputSource( - node=upstream_node, - output_name=output_name, - output_type=output_type, - ), + # add node inputs from task arguments; resolve any source references + # into custom NodeInputSource + pipeline_node["inputs"] = {"parameters": [], "artifacts": []} + + if task.arguments is not None: + + # parameters + input_parameters = task.arguments.parameters + + if input_parameters is not None: + for input_parameter in input_parameters: + pipeline_node_input_parameter = ( + cls.build_pipeline_node_input_parameter( + input_parameter ) ) + pipeline_node["inputs"]["parameters"].append( + pipeline_node_input_parameter + ) - # build artifact inputs - for node_argument in node_input_artifacts: - ( - upstream_node, - output_type, - output_name, - ) = cls.resolve_value_expression(node_argument["from_"]) - pipeline_node_inputs["artifacts"].append( - NodeArtifactInput( - name=node_argument["name"], - source=NodeInputSource( - node=upstream_node, - output_name=output_name, - output_type=output_type, - ), + # artifacts + input_artifacts = task.arguments.artifacts + if input_artifacts is not None: + for input_artifact in input_artifacts: + pipeline_node_input_artifact = ( + cls.build_pipeline_node_input_artifact(input_artifact) ) - ) + pipeline_node["inputs"]["artifacts"].append( + pipeline_node_input_artifact + ) + + return PipelineNode.model_validate(pipeline_node) + + @classmethod + def build_dag( + cls, workflow_template_spec: WorkflowSpecModel + ) -> List[PipelineNode]: + """Utility to build the Pipeline class' dag attribute. + + Args: + workflow_template_spec (WorkflowSpecModel): The spec field of a + hera.workflows.WorkflowTemplate class instance + + Returns: + List[PipelineNode]: The constructed dag attribute of a Pipeline + instance. + """ + + ( + inner_dag_template, + component_templates_dict, + ) = cls.get_relevant_templates(workflow_template_spec) - pipeline_node["inputs"] = pipeline_node_inputs - dag.append(PipelineNode.model_validate(pipeline_node)) + dag = [] + + for task in inner_dag_template.dag.tasks: + pipeline_node = cls.build_pipeline_node( + task, component_templates_dict + ) + dag.append(pipeline_node) return dag @@ -355,49 +659,139 @@ def from_hera_workflow_template_model( Pipeline: A Pipeline class instance. """ - workflow_template_dict = workflow_template_resource.dict() - workflow_template_spec = workflow_template_dict["spec"].copy() + workflow_template_spec: WorkflowSpecModel = ( + workflow_template_resource.spec + ) # metadata metadata = PipelineMetadata( - pipeline=workflow_template_dict["metadata"], - flow=workflow_template_spec.get("workflow_metadata", None), - component=workflow_template_spec.get("pod_metadata", None), + pipeline=workflow_template_resource.metadata.dict(), + flow=getattr(workflow_template_spec, "workflow_metadata", None), + component=getattr(workflow_template_spec, "pod_metadata", None), ) # templates - entrypoint_template = workflow_template_spec["entrypoint"] templates = [] - for template in workflow_template_spec["templates"]: - # we are not interested in the entrypoint template - if template["name"] == entrypoint_template: - continue - elif template["script"] is not None: - templates.append(ScriptTemplate.model_validate(template)) - elif template["resource"] is not None: - templates.append(ResourceTemplate.model_validate(template)) - - # inputs - inputs = [ - PipelineInputParameter(**parameter) - for parameter in workflow_template_spec["arguments"]["parameters"] - ] + for template in workflow_template_spec.templates: + # we are only interested in Script and Resource type templates + if template.script is not None: + templates.append( + ScriptTemplate.model_validate(template.dict()) + ) + elif template.resource is not None: + templates.append( + ResourceTemplate.model_validate(template.dict()) + ) + + # io + inputs, outputs = cls.build_io(workflow_template_spec) # dag dag = cls.build_dag(workflow_template_spec) return cls( - metadata=metadata, templates=templates, inputs=inputs, dag=dag + metadata=metadata, + templates=templates, + inputs=inputs, + outputs=outputs, + dag=dag, ) @classmethod - def transform_dag_visualization_node_position( - cls, x_y: Tuple[float, float] - ) -> Tuple[float, float]: + def build_visualization_pipeline_io_node( + cls, + io: Union[ + PipelineArtifactOutput, + PipelineParameterInput, + PipelineParameterOutput, + ], + ) -> DagPipelineIONode: + """Builds and returns a visualization node that represents a DAG's I/O + attribute in the visualization's diagram. + + Args: + io (Union[PipelineArtifactOutput,PipelineParameterInput,PipelineParameterOutput]): # noqa: E501 + The I/O instance + + Returns: + DagPipelineIONode: The instance representing the DAG's I/O attibute + in the visualization's diagram. + """ + pipeline_io_node_name = ( + f"pipeline_{io.type}_{io.argument_type}_{io.name}" + ) + return DagPipelineIONode( + id=pipeline_io_node_name, + data={ + "label": f"{PIPELINE_NODE_EMOJI_MAP[io.type]['pipeline']} {PIPELINE_NODE_EMOJI_MAP[io.argument_type]} {io.name}", # noqa: E501 + "value": getattr(io, "value", None), + }, + ) + + @classmethod + def build_visualization_task_io_node( + cls, + task_name: str, + io: Union[ + NodeParameterInput, + NodeArtifactInput, + NodeParameterOutput, + NodeArtifactOutput, + ], + ) -> DagTaskIONode: + """Builds and returns a visualization node that represents a DAG's + task's I/O attribute in the visualization's diagram. + + Args: + task_name (str): The name of the task + io (Union[NodeParameterInput, NodeArtifactInput, NodeParameterOutput, NodeArtifactOutput]): # noqa: E501 + The I/O instance. + + Returns: + DagTaskIONode: The instance representing the task's I/O attibute in + the visualization's diagram. + """ + task_io_node_name = ( + f"{task_name}_{io.type}_{io.argument_type}_{io.name}" + ) + + return DagTaskIONode( + id=task_io_node_name, + data={ + "label": f"{PIPELINE_NODE_EMOJI_MAP[io.type]['task']} {PIPELINE_NODE_EMOJI_MAP[io.argument_type]} {io.name}", # noqa: E501 + "value": getattr(io, "value", None), + }, + ) + + @classmethod + def build_visualization_connection( + cls, + source_node_name: str, + target_node_name: str, + animated: bool = True, + ) -> DagConnection: + """Builds and returns a visualization edge that represents a dependency + between the specified source and target nodes (which can represent a + DAG's task or an I/O, respectively) the visualization's diagram. - transformed_x_y = 350 * x_y[0], 150 * x_y[1] + Args: + source_node_name (str): _description_ + target_node_name (str): _description_ + animated (bool, optional): _description_. Defaults to True. + + Returns: + DagConnection: The instance representing the dependency between the + specified source and target nodes in the visualization's + diagram. + """ - return transformed_x_y + return DagConnection( + id=f"{source_node_name}->{target_node_name}", + source=source_node_name, + target=target_node_name, + animated=animated, + edge_type="smoothstep", + ) def create_dag_visualization_schema( self, @@ -414,116 +808,116 @@ def create_dag_visualization_schema( visualizing the DAG on the dashboard. """ - connections: List[Dict] = [] - nodes: List[Dict] = [] + vis_connections: List[Dict] = [] + vis_nodes: List[Dict] = [] for task_node in self.dag: - task_node_name = task_node.name - - nodes.append( - DagTaskNode( - id=task_node_name, - data={ - "label": f"{PIPELINE_NODE_EMOJI_MAP['task']} {task_node_name}" # noqa: E501 - }, - ) + vis_task_node = DagTaskNode( + id=task_node.name, + data={ + "label": f"{PIPELINE_NODE_EMOJI_MAP['task']} {task_node.name}" # noqa: E501 + }, ) + vis_nodes.append(vis_task_node) # we only create task_node <-> task_node connections if we dont # display the tasks' IO specs - if not include_task_io: - if task_node.depends is not None: - for upstream_node_name in task_node.depends: - connections.append( - DagConnection( - id=f"{upstream_node_name}->{task_node_name}", - source=upstream_node_name, - target=task_node_name, - animated=True, - edge_type="smoothstep", - ) + # if not include_task_io: + if task_node.depends is not None: + for upstream_node_name in task_node.depends: + # add the task->task connection + task_to_task_connection = ( + self.build_visualization_connection( + source_node_name=upstream_node_name, + target_node_name=task_node.name, + animated=not include_task_io, ) - # if we include the tasks' IO specs, we need to draw - # - io nodes and - # connections between - # - inputs and outputs, and - # - inputs/outputs and associated task_nodes - else: - for interface_type in ["inputs", "outputs"]: - - for argument_type in ["parameters", "artifacts"]: - arguments = getattr( - getattr(task_node, interface_type), argument_type + ) + vis_connections.append(task_to_task_connection) + + # if we include the tasks' I/O specs, we need to draw + # - task I/O nodes + # and + # - task_input->task, + # - source_(input/output)->task_input, + # - task->task_output + # connections + if include_task_io: + for io in task_node.ios: + # add the task I/O node + vis_task_io_node = self.build_visualization_task_io_node( + task_name=task_node.name, io=io + ) + vis_nodes.append(vis_task_io_node) + + if io.type == "inputs": + # add the task_input->task connection + task_input_to_task_connection = ( + self.build_visualization_connection( + source_node_name=vis_task_io_node.id, + target_node_name=task_node.name, + animated=False, + ) ) - if not arguments: - continue - - for argument in arguments: - # add the task io node - task_io_node_name = f"{task_node_name}_{interface_type}_{argument_type}_{argument.name}" # noqa: E501 - nodes.append( - DagTaskIONode( - id=task_io_node_name, - data={ - "label": f"{PIPELINE_NODE_EMOJI_MAP[interface_type]['task']} {PIPELINE_NODE_EMOJI_MAP[argument_type]} {argument.name}", # noqa: E501 - "value": getattr( - argument, "value", None - ), - }, + vis_connections.append(task_input_to_task_connection) + + try: + # add the source_(input/output)->task_input + # connection + assert io.source is not None + vis_source_io_node_name = f"{io.source.node_name}_{io.source.io_type}_{io.source.io_argument_type}_{io.source.io_name}" # noqa: E501 + source_io_to_task_input_connection = ( + self.build_visualization_connection( + source_node_name=vis_source_io_node_name, + target_node_name=vis_task_io_node.id, ) ) - - # connect that task io node with the task node - if interface_type == "inputs": - upstream_node_name = task_io_node_name - node_name = task_node_name - else: - upstream_node_name = task_node_name - node_name = task_io_node_name - - connections.append( - DagConnection( - id=f"{upstream_node_name}->{node_name}", - source=upstream_node_name, - target=node_name, - animated=False, - edge_type="smoothstep", - ) + vis_connections.append( + source_io_to_task_input_connection ) + except (AttributeError, AssertionError): + pass + + elif io.type == "outputs": + # add the task->task_output + task_to_task_output_connection = ( + self.build_visualization_connection( + source_node_name=task_node.name, + target_node_name=vis_task_io_node.id, + animated=False, + ) + ) + vis_connections.append(task_to_task_output_connection) - # connect the input type task io node with the - # upstream output type task io node - where - # appropriate - if ( - interface_type == "inputs" - and getattr(argument, "source", None) - is not None - ): - task_io_source = argument.source - upstream_node_name = f"{task_io_source.node}_outputs_{task_io_source.output_type}_{task_io_source.output_name}" # noqa: E501 - connections.append( - DagConnection( - id=f"{upstream_node_name}->{task_io_node_name}", # noqa: E501 - source=upstream_node_name, - target=task_io_node_name, - animated=True, - edge_type="smoothstep", - ) - ) - + # if we include the tasks' I/O specs, we need to draw + # - pipeline I/O nodes + # and + # - source_task_output->pipeline_output + # connections if include_task_io: - for input in self.inputs: - node_name = f"pipeline_outputs_parameters_{input.name}" - nodes.append( - DagPipelineIONode( - id=node_name, - data={ - "label": f"{PIPELINE_NODE_EMOJI_MAP['inputs']['pipeline']} {PIPELINE_NODE_EMOJI_MAP['parameters']} {input.name}", # noqa: E501 - "value": input.value, - }, - node_type="input", - ) + for io in self.ios: + vis_pipeline_io_node = ( + self.build_visualization_pipeline_io_node(io) ) + vis_nodes.append(vis_pipeline_io_node) + + try: + # add the source_task_output->pipeline_output connection + assert io.source is not None + vis_source_io_node_name = f"{io.source.node_name}_{io.source.io_type}_{io.source.io_argument_type}_{io.source.io_name}" # noqa: E501 + source_io_to_pipeline_output_connection = ( + self.build_visualization_connection( + source_node_name=vis_source_io_node_name, + target_node_name=vis_pipeline_io_node.id, + ) + ) + vis_connections.append( + source_io_to_pipeline_output_connection + ) + except (AttributeError, AssertionError): + pass - return DagVisualizationItems(connections=connections, nodes=nodes) + return DagVisualizationItems( + connections=vis_connections, nodes=vis_nodes + ) diff --git a/sdk/setup.py b/sdk/setup.py index a38a2c2..42c39a8 100644 --- a/sdk/setup.py +++ b/sdk/setup.py @@ -18,7 +18,7 @@ def get_extra_requirements() -> Dict[str, List[str]]: extra_requirements = { SDKExtras.dashboard.value: [ - "streamlit==1.37.1", + "streamlit==1.40.2", "streamlit-option-menu==0.3.13", "st-pages==0.5.0", "streamlit-extras==0.4.6", diff --git a/sdk/test/unit/conftest.py b/sdk/test/unit/conftest.py deleted file mode 100644 index 7c74782..0000000 --- a/sdk/test/unit/conftest.py +++ /dev/null @@ -1,9231 +0,0 @@ -import datetime -import os -from typing import Callable, List - -import pytest -from bettmensch_ai.pipelines.constants import ResourceType -from bettmensch_ai.pipelines.io import ( - InputArtifact, - InputParameter, - OutputArtifact, - OutputParameter, -) -from pydantic import BaseModel - - -@pytest.fixture -def test_output_dir(): - return os.path.join(".", "sdk", "test", "unit", "outputs") - - -@pytest.fixture -def test_mock_pipeline(): - class MockPipeline: - type = ResourceType.pipeline.value - io_owner_name = ResourceType.pipeline.value - - return MockPipeline() - - -@pytest.fixture -def test_mock_component(): - class MockComponent: - type = ResourceType.component.value - name = "mock-component-0" - io_owner_name = f"{type}.{name}" - - return MockComponent() - - -@pytest.fixture -def test_mock_script(test_function_and_task_inputs): - test_function, _ = test_function_and_task_inputs - - class MockArgument(BaseModel): - name: str - - class MockIO(BaseModel): - parameters: List[MockArgument] - artifacts: List[MockArgument] - - class MockScript: - source: Callable = test_function - add_cwd_to_sys_path: bool = False - - def _build_inputs(self): - - return MockIO( - parameters=[ - MockArgument(name="a"), - MockArgument(name="b"), - MockArgument(name="c"), - ], - artifacts=[MockArgument(name="d")], - ) - - def _build_outputs(self): - - return MockIO( - parameters=[MockArgument(name="a_out")], - artifacts=[MockArgument(name="b_out")], - ) - - return MockScript() - - -@pytest.fixture -def test_function_and_task_inputs(test_mock_pipeline, test_mock_component): - def test_function( - a: InputParameter, - b: InputParameter, - c: InputParameter, - d: InputArtifact, - a_out: OutputParameter, - b_out: OutputArtifact, - ): - pass - - test_input_a = InputParameter("fixed", 1) - test_input_b = InputParameter("mock_pipe_in", 1) - test_input_b.set_owner(test_mock_pipeline) - test_input_c = OutputParameter("mock_comp_out_param") - test_input_c.set_owner(test_mock_component) - test_input_d = OutputArtifact("mock_comp_out_art") - test_input_d.set_owner(test_mock_component) - - task_inputs = { - "a": test_input_a, - "b": test_input_b, - "c": test_input_c, - "d": test_input_d, - } - - return test_function, task_inputs - - -@pytest.fixture -def test_hera_artifact_workflow_template_model(): - class MockHeraWorkflowTemplateModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": None, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2024, 12, 8), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-artifact-pipeline-", - "generation": 1, - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2024, 12, 8), - } - ], - "name": "pipeline-test-artifact-pipeline-d5rzf", - "namespace": "argo", - "owner_references": None, - "resource_version": "9057", - "self_link": None, - "uid": "310b62f6-95fb-418f-ab28-e7070b183979", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "Param A", - "value_from": None, - } - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "convert-to-artifact-0", - "on_exit": None, - "template": "convert-to-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": None, - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "convert-to-artifact-0", - "hooks": None, - "inline": None, - "name": "show-artifact-0", - "on_exit": None, - "template": "show-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "a_art", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "convert-to-artifact", - "node_selector": None, - "outputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a_art", - "optional": None, - "oss": None, - "path": "a_art", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": "a", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "show-artifact", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact("a")\n\ndef show_artifact(a: InputArtifact) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\nshow_artifact(a)', - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": None, - }, - } - - return MockHeraWorkflowTemplateModel() - - -@pytest.fixture -def test_hera_parameter_workflow_template_model(): - class MockHeraWorkflowTemplateModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": None, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2024, 8, 12), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-parameter-pipeline-", - "generation": 1, - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2024, 8, 12), - } - ], - "name": "pipeline-test-parameter-pipeline-mhwgd", - "namespace": "argo", - "owner_references": None, - "resource_version": "9922", - "self_link": None, - "uid": "eb9cff7d-b949-4aa9-9cf6-703b2a602128", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "1", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "a-plus-b-0", - "on_exit": None, - "template": "a-plus-b", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "a-plus-b-0", - "hooks": None, - "inline": None, - "name": "a-plus-b-plus-2-0", - "on_exit": None, - "template": "a-plus-b-plus-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "a-plus-b", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "a-plus-b-plus-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": None, - }, - } - - return MockHeraWorkflowTemplateModel() - - -@pytest.fixture -def test_hera_torch_gpu_workflow_template_model(): - class MockHeraWorkflowTemplateModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": None, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2024, 8, 12), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-torch-gpu-pipeline-", - "generation": 1, - "labels": { - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2024, 8, 12), - } - ], - "name": "pipeline-test-torch-gpu-pipeline-dcfq8", - "namespace": "argo", - "owner_references": None, - "resource_version": "11645", - "self_link": None, - "uid": "1527e48c-6646-4cc4-8a54-edd274467a44", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "None", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "None", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "torch-ddp-create-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "create", - "failure_condition": None, - "flags": None, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "torch-ddp-delete-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "delete", - "failure_condition": None, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo", - ], - "manifest": None, - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "torch-ddp-create-torch-service", - "on_exit": None, - "template": "torch-ddp-create-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0", - "on_exit": None, - "template": "torch-ddp-0", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-1", - "on_exit": None, - "template": "torch-ddp-1", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-2", - "on_exit": None, - "template": "torch-ddp-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-3", - "on_exit": None, - "template": "torch-ddp-3", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "torch-ddp-delete-torch-service", - "on_exit": None, - "template": "torch-ddp-delete-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "show-duration-param-0", - "on_exit": None, - "template": "show-duration-param", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0", - }, - }, - "metrics": None, - "name": "torch-ddp-0", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": None, - "host_port": None, - "name": "ddp", - "protocol": "TCP", - } - ], - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1", - }, - }, - "metrics": None, - "name": "torch-ddp-1", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2", - }, - }, - "metrics": None, - "name": "torch-ddp-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3", - }, - }, - "metrics": None, - "name": "torch-ddp-3", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - } - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "show-duration-param", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": None, - }, - } - - return MockHeraWorkflowTemplateModel() - - -@pytest.fixture -def test_hera_artifact_workflow_model(): - class MockHeraWorkflowModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "True", - "workflows.argoproj.io/pod-name-format": "v2", - }, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2022, 11, 8), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-artifact-pipeline-d5rzf-flow-", - "generation": 6, - "labels": { - "workflows.argoproj.io/completed": "True", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded", - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - ], - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "namespace": "argo", - "owner_references": None, - "resource_version": "18180", - "self_link": None, - "uid": "dc477fa6-dd12-43b7-8511-e3dc03bf023c", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "Second integration test value a", - "value_from": None, - } - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": None, - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": None, - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-artifact-pipeline-d5rzf", - }, - }, - "status": { - "artifact_gc_status": { - "not_specified": True, - "pods_recouped": None, - "strategies_processed": None, - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": None, - "artifactory": None, - "azure": None, - "gcs": None, - "hdfs": None, - "oss": None, - "s3": { - "access_key_secret": None, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": True, - "key_format": None, - "key_prefix": None, - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - }, - "config_map": "artifact-repositories", - "default": None, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - }, - "compressed_nodes": None, - "conditions": [ - { - "message": None, - "status": "False", - "type": "PodRunning", - }, - { - "message": None, - "status": "True", - "type": "Completed", - }, - ], - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "message": None, - "nodes": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9": { - "boundary_id": None, - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393" - ], - "daemoned": None, - "display_name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "inputs": None, - "memoization_status": None, - "message": None, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "node_flag": None, - "outbound_nodes": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" - ], - "outputs": None, - "phase": "Succeeded", - "pod_ip": None, - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 48}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "bettmensch-ai-dag", - "template_ref": None, - "template_scope": "local/", - "type": "DAG", - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554" - ], - "daemoned": None, - "display_name": "show-artifact-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743", - "inputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": "a", - "raw": None, - "recurse_mode": None, - "s3": { - "access_key_secret": None, - "bucket": None, - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": None, - "insecure": None, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - "sub_path": None, - } - ], - "parameters": None, - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 24}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "show-artifact", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1037491743" - ], - "daemoned": None, - "display_name": "convert-to-artifact-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "Second integration test value a", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "a_art", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a_art", - "optional": None, - "oss": None, - "path": "a_art", - "raw": None, - "recurse_mode": None, - "s3": { - "access_key_secret": None, - "bucket": None, - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": None, - "insecure": None, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - "sub_path": None, - } - ], - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 24}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "convert-to-artifact", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "children": None, - "daemoned": None, - "display_name": "show-artifact-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554", - "inputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": "a", - "raw": None, - "recurse_mode": None, - "s3": { - "access_key_secret": None, - "bucket": None, - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": None, - "insecure": None, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - "sub_path": None, - } - ], - "parameters": None, - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.show-artifact-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 24}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "show-artifact", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393": { - "boundary_id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9", - "children": [ - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056" - ], - "daemoned": None, - "display_name": "convert-to-artifact-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-3688018393", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "Second integration test value a", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "a_art", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9.convert-to-artifact-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a_art", - "optional": None, - "oss": None, - "path": "a_art", - "raw": None, - "recurse_mode": None, - "s3": { - "access_key_secret": None, - "bucket": None, - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": None, - "insecure": None, - "key": "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9/pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-convert-to-artifact-1820573056/a_art.tgz", - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - "sub_path": None, - } - ], - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 48}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "convert-to-artifact", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - }, - "offload_node_status_version": None, - "outputs": None, - "persistent_volume_claims": None, - "phase": "Succeeded", - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 48}, - "started_at": datetime.datetime(2022, 11, 8), - "stored_templates": { - "namespaced/pipeline-test-artifact-pipeline-d5rzf/bettmensch-ai-dag": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "convert-to-artifact-0", - "on_exit": None, - "template": "convert-to-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": None, - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "convert-to-artifact-0", - "hooks": None, - "inline": None, - "name": "show-artifact-0", - "on_exit": None, - "template": "show-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/convert-to-artifact": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "a_art", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "convert-to-artifact", - "node_selector": None, - "outputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a_art", - "optional": None, - "oss": None, - "path": "a_art", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-artifact-pipeline-d5rzf/show-artifact": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": "a", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "show-artifact", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact("a")\n\ndef show_artifact(a: InputArtifact) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\nshow_artifact(a)', - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "Second integration test value a", - "value_from": None, - } - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": "argo-workflow", - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "convert-to-artifact-0", - "on_exit": None, - "template": "convert-to-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": None, - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "convert-to-artifact-0", - "hooks": None, - "inline": None, - "name": "show-artifact-0", - "on_exit": None, - "template": "show-artifact", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "a_art", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "convert-to-artifact", - "node_selector": None, - "outputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a_art", - "optional": None, - "oss": None, - "path": "a_art", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\nconvert_to_artifact(a,a_art)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": [ - { - "archive": None, - "archive_logs": None, - "artifact_gc": None, - "artifactory": None, - "azure": None, - "deleted": None, - "from_": None, - "from_expression": None, - "gcs": None, - "git": None, - "global_name": None, - "hdfs": None, - "http": None, - "mode": None, - "name": "a", - "optional": None, - "oss": None, - "path": "a", - "raw": None, - "recurse_mode": None, - "s3": None, - "sub_path": None, - } - ], - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "show-artifact", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.io import InputArtifact\na = InputArtifact("a")\n\ndef show_artifact(a: InputArtifact) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\nshow_artifact(a)', - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-artifact-pipeline-d5rzf", - }, - }, - "synchronization": None, - "task_results_completion_status": { - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-1820573056": True, - "pipeline-test-artifact-pipeline-d5rzf-flow-pmrv9-2313483554": True, - }, - }, - } - - return MockHeraWorkflowModel() - - -@pytest.fixture -def test_hera_parameter_workflow_model(): - class MockHeraWorkflowModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "True", - "workflows.argoproj.io/pod-name-format": "v2", - }, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2022, 11, 8), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-parameter-pipeline-mhwgd-flow-", - "generation": 6, - "labels": { - "workflows.argoproj.io/completed": "True", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded", - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - ], - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "namespace": "argo", - "owner_references": None, - "resource_version": "18503", - "self_link": None, - "uid": "ddfe31ae-1231-4a2d-be6c-4b712bcc15a6", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "-10", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "20", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": None, - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": None, - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-parameter-pipeline-mhwgd", - }, - }, - "status": { - "artifact_gc_status": { - "not_specified": True, - "pods_recouped": None, - "strategies_processed": None, - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": None, - "artifactory": None, - "azure": None, - "gcs": None, - "hdfs": None, - "oss": None, - "s3": { - "access_key_secret": None, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": True, - "key_format": None, - "key_prefix": None, - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - }, - "config_map": "artifact-repositories", - "default": None, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - }, - "compressed_nodes": None, - "conditions": [ - { - "message": None, - "status": "False", - "type": "PodRunning", - }, - { - "message": None, - "status": "True", - "type": "Completed", - }, - ], - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "message": None, - "nodes": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp": { - "boundary_id": None, - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729" - ], - "daemoned": None, - "display_name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "inputs": None, - "memoization_status": None, - "message": None, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "node_flag": None, - "outbound_nodes": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" - ], - "outputs": None, - "phase": "Succeeded", - "pod_ip": None, - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 47}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "bettmensch-ai-dag", - "template_ref": None, - "template_scope": "local/", - "type": "DAG", - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "children": None, - "daemoned": None, - "display_name": "a-plus-b-plus-2-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "10", - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "12", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 23}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "a-plus-b-plus-2", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384" - ], - "daemoned": None, - "display_name": "a-plus-b-plus-2-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "10", - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-plus-2-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "12", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 23}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "a-plus-b-plus-2", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3352155217" - ], - "daemoned": None, - "display_name": "a-plus-b-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "-10", - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "20", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "10", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 24}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "a-plus-b", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729": { - "boundary_id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp", - "children": [ - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680" - ], - "daemoned": None, - "display_name": "a-plus-b-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-4203966729", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "-10", - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "20", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp.a-plus-b-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": "10", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 47}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "a-plus-b", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - }, - "offload_node_status_version": None, - "outputs": None, - "persistent_volume_claims": None, - "phase": "Succeeded", - "progress": "2/2", - "resources_duration": {"cpu": 2, "memory": 47}, - "started_at": datetime.datetime(2022, 11, 8), - "stored_templates": { - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "a-plus-b", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/a-plus-b-plus-2": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "a-plus-b-plus-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-parameter-pipeline-mhwgd/bettmensch-ai-dag": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "a-plus-b-0", - "on_exit": None, - "template": "a-plus-b", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "a-plus-b-0", - "hooks": None, - "inline": None, - "name": "a-plus-b-plus-2-0", - "on_exit": None, - "template": "a-plus-b-plus-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "-10", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "20", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": "argo-workflow", - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{workflow.parameters.a}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "{{workflow.parameters.b}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "a-plus-b-0", - "on_exit": None, - "template": "a-plus-b", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": "2", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "a-plus-b-0", - "hooks": None, - "inline": None, - "name": "a-plus-b-plus-2-0", - "on_exit": None, - "template": "a-plus-b-plus-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "a-plus-b", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "1", - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - }, - { - "default": "2", - "description": None, - "enum": None, - "global_name": None, - "name": "b", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "a-plus-b-plus-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "sum", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "sum", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\nadd_parameters(a,b,sum)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-parameter-pipeline-mhwgd", - }, - }, - "synchronization": None, - "task_results_completion_status": { - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-2921145384": True, - "pipeline-test-parameter-pipeline-mhwgd-flow-hgtcp-3648717680": True, - }, - }, - } - - return MockHeraWorkflowModel() - - -@pytest.fixture -def test_hera_torch_gpu_workflow_model(): - class MockHeraWorkflowModel: - def dict(self): - - return { - "api_version": None, - "kind": None, - "metadata": { - "annotations": { - "karpenter.sh/do-not-disrupt": "True", - "workflows.argoproj.io/pod-name-format": "v2", - }, - "cluster_name": None, - "creation_timestamp": datetime.datetime(2022, 11, 8), - "deletion_grace_period_seconds": None, - "deletion_timestamp": None, - "finalizers": None, - "generate_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-", - "generation": 13, - "labels": { - "workflows.argoproj.io/completed": "True", - "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", - "workflows.argoproj.io/phase": "Succeeded", - }, - "managed_fields": [ - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "argo", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - { - "api_version": "argoproj.io/v1alpha1", - "fields_type": "FieldsV1", - "fields_v1": {}, - "manager": "workflow-controller", - "operation": "Update", - "subresource": None, - "time": datetime.datetime(2022, 11, 8), - }, - ], - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "namespace": "argo", - "owner_references": None, - "resource_version": "13587", - "self_link": None, - "uid": "93098e5d-b8fe-4e2a-83d8-e19b7489c980", - }, - "spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": None, - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": None, - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": None, - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8", - }, - }, - "status": { - "artifact_gc_status": { - "not_specified": True, - "pods_recouped": None, - "strategies_processed": None, - }, - "artifact_repository_ref": { - "artifact_repository": { - "archive_logs": None, - "artifactory": None, - "azure": None, - "gcs": None, - "hdfs": None, - "oss": None, - "s3": { - "access_key_secret": None, - "bucket": "bettmensch-ai-artifact-repository", - "ca_secret": None, - "create_bucket_if_not_present": None, - "encryption_options": None, - "endpoint": "s3.us-east-2.amazonaws.com", - "insecure": True, - "key_format": None, - "key_prefix": None, - "region": None, - "role_arn": None, - "secret_key_secret": None, - "use_sdk_creds": None, - }, - }, - "config_map": "artifact-repositories", - "default": None, - "key": "bettmensch-ai-artifact-repository", - "namespace": "argo", - }, - "compressed_nodes": None, - "conditions": [ - { - "message": None, - "status": "False", - "type": "PodRunning", - }, - { - "message": None, - "status": "True", - "type": "Completed", - }, - ], - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "message": None, - "nodes": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx": { - "boundary_id": None, - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784" - ], - "daemoned": None, - "display_name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "inputs": None, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "node_flag": None, - "outbound_nodes": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811", - ], - "outputs": None, - "phase": "Succeeded", - "pod_ip": None, - "progress": "7/7", - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "bettmensch-ai-dag", - "template_ref": None, - "template_scope": "local/", - "type": "DAG", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": None, - "daemoned": None, - "display_name": "torch-ddp-delete-torch-service", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811", - "inputs": None, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-delete-torch-service", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 0, "memory": 0}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-delete-torch-service", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759" - ], - "daemoned": None, - "display_name": "show-duration-param-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "60", - "value_from": None, - } - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 23}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "show-duration-param", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": None, - "daemoned": None, - "display_name": "torch-ddp-0-worker-1(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-50-242.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-1", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843" - ], - "daemoned": None, - "display_name": "torch-ddp-0-worker-3", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-3", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358" - ], - "daemoned": None, - "display_name": "torch-ddp-0-worker-2", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-2", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1664656268", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811", - ], - "daemoned": None, - "display_name": "torch-ddp-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-49-47.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 15, - "memory": 1112, - "nvidia.com/gpu": 138, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-0", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": None, - "daemoned": None, - "display_name": "torch-ddp-0-worker-3(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-49-43.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-3(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 973, - "nvidia.com/gpu": 120, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-3", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877" - ], - "daemoned": None, - "display_name": "torch-ddp-0-worker-1", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-1", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 14, - "memory": 1013, - "nvidia.com/gpu": 124, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-1", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": None, - "daemoned": None, - "display_name": "torch-ddp-0-worker-2(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-50-184.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0-worker-2(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": { - "cpu": 13, - "memory": 966, - "nvidia.com/gpu": 118, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-2", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-233964726", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-217187107", - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-200409488", - ], - "daemoned": None, - "display_name": "torch-ddp-create-torch-service", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784", - "inputs": None, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-create-torch-service", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 0, "memory": 0}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-create-torch-service", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": [ - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662" - ], - "daemoned": None, - "display_name": "torch-ddp-0", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": None, - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-3686612827", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "None", - "value_from": None, - }, - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.torch-ddp-0", - "node_flag": None, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": "60", - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "3/3", - "resources_duration": { - "cpu": 16, - "memory": 1135, - "nvidia.com/gpu": 138, - }, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "torch-ddp-0", - "template_ref": None, - "template_scope": "local/", - "type": "Retry", - }, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": { - "boundary_id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx", - "children": None, - "daemoned": None, - "display_name": "show-duration-param-0(0)", - "estimated_duration": None, - "finished_at": datetime.datetime(2022, 11, 8), - "host_node_name": "ip-10-0-48-52.us-east-2.compute.internal", - "id": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759", - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "60", - "value_from": None, - } - ], - }, - "memoization_status": None, - "message": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx.show-duration-param-0(0)", - "node_flag": {"hooked": None, "retried": True}, - "outbound_nodes": None, - "outputs": { - "artifacts": None, - "exit_code": "0", - "parameters": None, - "result": None, - }, - "phase": "Succeeded", - "pod_ip": None, - "progress": "1/1", - "resources_duration": {"cpu": 1, "memory": 23}, - "started_at": datetime.datetime(2022, 11, 8), - "synchronization_status": None, - "template_name": "show-duration-param", - "template_ref": None, - "template_scope": "local/", - "type": "Pod", - }, - }, - "offload_node_status_version": None, - "outputs": None, - "persistent_volume_claims": None, - "phase": "Succeeded", - "progress": "7/7", - "resources_duration": { - "cpu": 57, - "memory": 4087, - "nvidia.com/gpu": 500, - }, - "started_at": datetime.datetime(2022, 11, 8), - "stored_templates": { - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/bettmensch-ai-dag": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "torch-ddp-create-torch-service", - "on_exit": None, - "template": "torch-ddp-create-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0", - "on_exit": None, - "template": "torch-ddp-0", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-1", - "on_exit": None, - "template": "torch-ddp-1", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-2", - "on_exit": None, - "template": "torch-ddp-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-3", - "on_exit": None, - "template": "torch-ddp-3", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "torch-ddp-delete-torch-service", - "on_exit": None, - "template": "torch-ddp-delete-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "show-duration-param-0", - "on_exit": None, - "template": "show-duration-param", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/show-duration-param": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - } - ], - }, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "show-duration-param", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-0": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0", - }, - }, - "metrics": None, - "name": "torch-ddp-0", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": None, - "host_port": None, - "name": "ddp", - "protocol": "TCP", - } - ], - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-1": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1", - }, - }, - "metrics": None, - "name": "torch-ddp-1", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-2": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2", - }, - }, - "metrics": None, - "name": "torch-ddp-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-3": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3", - }, - }, - "metrics": None, - "name": "torch-ddp-3", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-create-torch-service": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "torch-ddp-create-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "create", - "failure_condition": None, - "flags": None, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - "namespaced/pipeline-test-torch-gpu-pipeline-dcfq8/torch-ddp-delete-torch-service": { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": {"artifacts": None, "parameters": None}, - "memoize": None, - "metadata": {"annotations": None, "labels": None}, - "metrics": None, - "name": "torch-ddp-delete-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "delete", - "failure_condition": None, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo", - ], - "manifest": None, - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - }, - "stored_workflow_template_spec": { - "active_deadline_seconds": None, - "affinity": None, - "archive_logs": None, - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "12", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "5", - "value_from": None, - }, - ], - }, - "artifact_gc": None, - "artifact_repository_ref": None, - "automount_service_account_token": None, - "dns_config": None, - "dns_policy": None, - "entrypoint": "bettmensch-ai-dag", - "executor": None, - "hooks": None, - "host_aliases": None, - "host_network": None, - "image_pull_secrets": None, - "metrics": None, - "node_selector": None, - "on_exit": None, - "parallelism": None, - "pod_disruption_budget": None, - "pod_gc": None, - "pod_metadata": None, - "pod_priority": None, - "pod_priority_class_name": None, - "pod_spec_patch": None, - "priority": None, - "retry_strategy": None, - "scheduler_name": None, - "security_context": None, - "service_account_name": "argo-workflow", - "shutdown": None, - "suspend": None, - "synchronization": None, - "template_defaults": None, - "templates": [ - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "torch-ddp-create-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "create", - "failure_condition": None, - "flags": None, - "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n namespace: argo\n labels:\n app: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n torch-job: torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6\n torch-node: '0' # Selector for pods associated with this service.\n", - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "torch-ddp-delete-torch-service", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": { - "action": "delete", - "failure_condition": None, - "flags": [ - "service", - "--selector", - "torch-job=torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "-n", - "argo", - ], - "manifest": None, - "manifest_from": None, - "merge_strategy": None, - "set_owner_reference": None, - "success_condition": None, - }, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": { - "fail_fast": None, - "target": None, - "tasks": [ - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": None, - "hooks": None, - "inline": None, - "name": "torch-ddp-create-torch-service", - "on_exit": None, - "template": "torch-ddp-create-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0", - "on_exit": None, - "template": "torch-ddp-0", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-1", - "on_exit": None, - "template": "torch-ddp-1", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-2", - "on_exit": None, - "template": "torch-ddp-2", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": "{{workflow.parameters.n_iter}}", - "value_from": None, - }, - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": "{{workflow.parameters.n_seconds_sleep}}", - "value_from": None, - }, - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-create-torch-service", - "hooks": None, - "inline": None, - "name": "torch-ddp-0-worker-3", - "on_exit": None, - "template": "torch-ddp-3", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": None, - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "torch-ddp-delete-torch-service", - "on_exit": None, - "template": "torch-ddp-delete-torch-service", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - { - "arguments": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", - "value_from": None, - } - ], - }, - "continue_on": None, - "dependencies": None, - "depends": "torch-ddp-0", - "hooks": None, - "inline": None, - "name": "show-duration-param-0", - "on_exit": None, - "template": "show-duration-param", - "template_ref": None, - "when": None, - "with_items": None, - "with_param": None, - "with_sequence": None, - }, - ], - }, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": None, - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "bettmensch-ai-dag", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": None, - "scheduler_name": None, - "script": None, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "0", - }, - }, - "metrics": None, - "name": "torch-ddp-0", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "0", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": [ - { - "container_port": 29200, - "host_ip": None, - "host_port": None, - "name": "ddp", - "protocol": "TCP", - } - ], - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "1", - }, - }, - "metrics": None, - "name": "torch-ddp-1", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "2", - }, - }, - "metrics": None, - "name": "torch-ddp-2", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "2", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": "100", - "description": None, - "enum": None, - "global_name": None, - "name": "n_iter", - "value": None, - "value_from": None, - }, - { - "default": "10", - "description": None, - "enum": None, - "global_name": None, - "name": "n_seconds_sleep", - "value": None, - "value_from": None, - }, - { - "default": "None", - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": None, - }, - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": { - "torch-job": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6", - "torch-node": "3", - }, - }, - "metrics": None, - "name": "torch-ddp-3", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "duration", - "value": None, - "value_from": { - "config_map_key_ref": None, - "default": None, - "event": None, - "expression": None, - "jq_filter": None, - "json_path": None, - "parameter": None, - "path": "duration", - "supplied": None, - }, - } - ], - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": [ - { - "name": "NCCL_DEBUG", - "value": "INFO", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_min_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_nodes", - "value": "4", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_node_rank", - "value": "3", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_nproc_per_node", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_max_restarts", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_start_method", - "value": "fork", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_backend", - "value": "static", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_url", - "value": "torch-ddp-0-c3ee0689-7a0b-4be4-8754-a019d7030eb6.argo.svc.cluster.local", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_rdzv_endpoint_port", - "value": "29200", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_run_id", - "value": "1", - "value_from": None, - }, - { - "name": "bettmensch_ai_distributed_torch_tee", - "value": "0", - "value_from": None, - }, - ], - "env_from": None, - "image": "bettmensch88/bettmensch.ai-torch:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - "requests": { - "cpu": "100m", - "memory": "700Mi", - "nvidia.com/gpu": "1", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\nfrom bettmensch_ai.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef torch_ddp(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import torch\n import torch.distributed as dist\n has_gpu = torch.cuda.is_available()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n a = torch.tensor([dist.get_rank()])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: World size {dist.get_world_size()}')\n print(f'{i}/{n_iter}: Rank {dist.get_rank()}')\n print(f'{i}/{n_iter}: This makes me worker process {dist.get_rank() + 1}/{dist.get_world_size()} globally!')\n if has_gpu:\n device = torch.device('cuda:0')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(0)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom bettmensch_ai.components import torch_distribute\n\ntorch_distribute_decorator=torch_distribute()\ntorch_distributed_function=torch_distribute_decorator(torch_ddp)\n\ntorch_distributed_function(n_iter,n_seconds_sleep,duration)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": [ - { - "effect": "NoSchedule", - "key": "nvidia.com/gpu", - "operator": "Exists", - "toleration_seconds": None, - "value": None, - } - ], - "volumes": None, - }, - { - "active_deadline_seconds": None, - "affinity": None, - "archive_location": None, - "automount_service_account_token": None, - "container": None, - "container_set": None, - "daemon": None, - "dag": None, - "data": None, - "executor": None, - "fail_fast": None, - "host_aliases": None, - "http": None, - "init_containers": None, - "inputs": { - "artifacts": None, - "parameters": [ - { - "default": None, - "description": None, - "enum": None, - "global_name": None, - "name": "a", - "value": None, - "value_from": None, - } - ], - }, - "memoize": None, - "metadata": { - "annotations": None, - "labels": None, - }, - "metrics": None, - "name": "show-duration-param", - "node_selector": None, - "outputs": { - "artifacts": None, - "exit_code": None, - "parameters": None, - "result": None, - }, - "parallelism": None, - "plugin": None, - "pod_spec_patch": None, - "priority": None, - "priority_class_name": None, - "resource": None, - "retry_strategy": { - "affinity": None, - "backoff": None, - "expression": None, - "limit": "1", - "retry_policy": "OnError", - }, - "scheduler_name": None, - "script": { - "args": None, - "command": ["python"], - "env": None, - "env_from": None, - "image": "bettmensch88/bettmensch.ai:3.11-latest", - "image_pull_policy": "Always", - "lifecycle": None, - "liveness_probe": None, - "name": "", - "ports": None, - "readiness_probe": None, - "resources": { - "limits": { - "cpu": "100m", - "memory": "100Mi", - }, - "requests": { - "cpu": "100m", - "memory": "100Mi", - }, - }, - "security_context": None, - "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\nshow_parameter(a)", - "startup_probe": None, - "stdin": None, - "stdin_once": None, - "termination_message_path": None, - "termination_message_policy": None, - "tty": None, - "volume_devices": None, - "volume_mounts": None, - "working_dir": None, - }, - "security_context": None, - "service_account_name": None, - "sidecars": None, - "steps": None, - "suspend": None, - "synchronization": None, - "timeout": None, - "tolerations": None, - "volumes": None, - }, - ], - "tolerations": None, - "ttl_strategy": None, - "volume_claim_gc": None, - "volume_claim_templates": None, - "volumes": None, - "workflow_metadata": None, - "workflow_template_ref": { - "cluster_scope": None, - "name": "pipeline-test-torch-gpu-pipeline-dcfq8", - }, - }, - "synchronization": None, - "task_results_completion_status": { - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1501533811": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-1906221877": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2258088662": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2336401843": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2953909358": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-2966531784": True, - "pipeline-test-torch-gpu-pipeline-dcfq8-flow-2tzsx-842282759": True, - }, - }, - } - - return MockHeraWorkflowModel() diff --git a/sdk/test/unit/pipelines/conftest.py b/sdk/test/unit/pipelines/conftest.py new file mode 100644 index 0000000..02a0d65 --- /dev/null +++ b/sdk/test/unit/pipelines/conftest.py @@ -0,0 +1,102 @@ +import os +from typing import Callable, List + +import pytest +from bettmensch_ai.pipelines.constants import ResourceType +from bettmensch_ai.pipelines.io import ( + InputArtifact, + InputParameter, + OutputArtifact, + OutputParameter, +) +from pydantic import BaseModel + + +@pytest.fixture +def test_output_dir(): + return os.path.join(".", "sdk", "test", "unit", "outputs") + + +@pytest.fixture +def test_mock_pipeline(): + class MockPipeline: + type = ResourceType.pipeline.value + io_owner_name = ResourceType.pipeline.value + + return MockPipeline() + + +@pytest.fixture +def test_mock_component(): + class MockComponent: + type = ResourceType.component.value + name = "mock-component-0" + io_owner_name = f"{type}.{name}" + + return MockComponent() + + +@pytest.fixture +def test_mock_script(test_function_and_task_inputs): + test_function, _ = test_function_and_task_inputs + + class MockArgument(BaseModel): + name: str + + class MockIO(BaseModel): + parameters: List[MockArgument] + artifacts: List[MockArgument] + + class MockScript: + source: Callable = test_function + add_cwd_to_sys_path: bool = False + + def _build_inputs(self): + + return MockIO( + parameters=[ + MockArgument(name="a"), + MockArgument(name="b"), + MockArgument(name="c"), + ], + artifacts=[MockArgument(name="d")], + ) + + def _build_outputs(self): + + return MockIO( + parameters=[MockArgument(name="a_out")], + artifacts=[MockArgument(name="b_out")], + ) + + return MockScript() + + +@pytest.fixture +def test_function_and_task_inputs(test_mock_pipeline, test_mock_component): + def test_function( + a: InputParameter, + b: InputParameter, + c: InputParameter, + d: InputArtifact, + a_out: OutputParameter, + b_out: OutputArtifact, + ): + pass + + test_input_a = InputParameter("fixed", 1) + test_input_b = InputParameter("mock_pipe_in", 1) + test_input_b.set_owner(test_mock_pipeline) + test_input_c = OutputParameter("mock_comp_out_param") + test_input_c.set_owner(test_mock_component) + test_input_d = OutputArtifact("mock_comp_out_art") + test_input_d.set_owner(test_mock_component) + + task_inputs = { + "a": test_input_a, + "b": test_input_b, + "c": test_input_c, + "d": test_input_d, + } + + return test_function, task_inputs diff --git a/sdk/test/unit/server/conftest.py b/sdk/test/unit/server/conftest.py new file mode 100644 index 0000000..69bc6a9 --- /dev/null +++ b/sdk/test/unit/server/conftest.py @@ -0,0 +1,3059 @@ +import os +from datetime import datetime + +import pytest +from hera.workflows.models import Workflow as WorkflowModel +from hera.workflows.models import WorkflowTemplate as WorkflowTemplateModel + + +@pytest.fixture +def test_output_dir(): + return os.path.join(".", "sdk", "test", "unit", "outputs") + + +@pytest.fixture +def test_datetime(): + return datetime(2024, 12, 1) + + +@pytest.fixture +def test_hera_artifact_workflow_template_model(test_datetime): + + return WorkflowTemplateModel( + metadata={ + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-artifact-pipeline-", + "generation": 1, + "labels": { + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + } + ], + "name": "pipeline-test-artifact-pipeline-jx7pb", + "namespace": "argo", + "resource_version": "7515", + "uid": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92", + }, + spec={ + "arguments": {"parameters": [{"name": "a", "value": "Param A"}]}, + "entrypoint": "bettmensch-ai-outer-dag", + "templates": [ + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + } + ] + }, + "name": "convert-to-artifact-0", + "template": "convert-to-artifact", + }, + { + "arguments": { + "artifacts": [ + { + "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", + "name": "a", + } + ] + }, + "depends": "convert-to-artifact-0", + "name": "show-artifact-0", + "template": "show-artifact", + }, + ] + }, + "inputs": { + "parameters": [{"name": "a", "value": "Param A"}] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "artifacts": [ + { + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b", + } + ] + }, + }, + { + "inputs": { + "parameters": [ + {"name": "a"}, + {"default": "null", "name": "a_art"}, + ] + }, + "metadata": {}, + "name": "convert-to-artifact", + "outputs": { + "artifacts": [{"name": "a_art", "path": "a_art"}] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", + }, + }, + { + "inputs": { + "artifacts": [{"name": "a", "path": "a"}], + "parameters": [{"default": "null", "name": "b"}], + }, + "metadata": {}, + "name": "show-artifact", + "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact("a")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact("b")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\n with open(b.path, \'w\') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n', + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + }, + ) + + +@pytest.fixture +def test_hera_parameter_workflow_template_model(test_datetime): + + return WorkflowTemplateModel( + metadata={ + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-parameter-pipeline-", + "generation": 1, + "labels": { + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + } + ], + "name": "pipeline-test-parameter-pipeline-c877j", + "namespace": "argo", + "resource_version": "7640", + "uid": "d2715290-865d-4776-84c4-776632cd7159", + }, + spec={ + "arguments": { + "parameters": [ + {"name": "a", "value": "1"}, + {"name": "b", "value": "2"}, + ] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "templates": [ + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + }, + { + "name": "b", + "value": "{{inputs.parameters.b}}", + }, + ] + }, + "name": "a-plus-b-0", + "template": "a-plus-b", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", + }, + {"name": "b", "value": "2"}, + ] + }, + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "a", "value": "1"}, + {"name": "b", "value": "2"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "parameters": [ + { + "name": "sum", + "value_from": { + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" + }, + } + ] + }, + }, + { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b-plus-2", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + }, + ) + + +@pytest.fixture +def test_hera_torch_gpu_workflow_template_model(test_datetime): + + return WorkflowTemplateModel( + metadata={ + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-torch-gpu-pipeline-", + "generation": 1, + "labels": { + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server" + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + } + ], + "name": "pipeline-test-torch-gpu-pipeline-7c4zp", + "namespace": "argo", + "resource_version": "9578", + "uid": "612226a1-b40f-4f68-92c3-ea8a5d6b3995", + }, + spec={ + "arguments": { + "parameters": [{"name": "n_iter"}, {"name": "n_seconds_sleep"}] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "templates": [ + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n", + }, + }, + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo", + ], + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1", + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", + } + ] + }, + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "n_iter"}, + {"name": "n_seconds_sleep"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {}, + }, + { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0", + } + }, + "name": "torch-ddp-0", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP", + } + ], + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1", + } + }, + "name": "torch-ddp-1", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + { + "inputs": {"parameters": [{"name": "a"}]}, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + }, + ) + + +@pytest.fixture +def test_hera_artifact_workflow_model(test_datetime): + + return WorkflowModel( + metadata={ + "annotations": { + "karpenter.sh/do-not-disrupt": "true", + "workflows.argoproj.io/pod-name-format": "v2", + }, + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-artifact-pipeline-jx7pb-flow-", + "generation": 7, + "labels": { + "bettmensch.ai/pipeline-id": "e2e6b22b-4dfc-413d-ad43-f06a3b03cb92", + "bettmensch.ai/pipeline-name": "pipeline-test-artifact-pipeline-jx7pb", + "workflows.argoproj.io/completed": "true", + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", + "workflows.argoproj.io/phase": "Succeeded", + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + }, + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "workflow-controller", + "operation": "Update", + "time": test_datetime, + }, + ], + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "namespace": "argo", + "resource_version": "7987", + "uid": "e7dd825f-1f8c-4bdf-87ca-b38ae6cd773c", + }, + spec={ + "arguments": { + "parameters": [ + {"name": "a", "value": "First integration test value a"} + ] + }, + "workflow_template_ref": { + "name": "pipeline-test-artifact-pipeline-jx7pb" + }, + }, + status={ + "artifact_gc_status": {"not_specified": True}, + "artifact_repository_ref": { + "artifact_repository": { + "s3": { + "bucket": "bettmensch-ai-artifact-repository", + "endpoint": "s3.us-east-2.amazonaws.com", + "insecure": True, + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}", + } + }, + "config_map": "artifact-repositories", + "key": "bettmensch-ai-artifact-repository", + "namespace": "argo", + }, + "conditions": [ + {"status": "False", "type": "PodRunning"}, + {"status": "True", "type": "Completed"}, + ], + "finished_at": test_datetime, + "nodes": { + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d": { + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876" + ], + "display_name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "finished_at": test_datetime, + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "outbound_nodes": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" + ], + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 68}, + "started_at": test_datetime, + "template_name": "bettmensch-ai-outer-dag", + "template_scope": "local/", + "type": "DAG", + }, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741" + ], + "display_name": "convert-to-artifact-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518", + "inputs": { + "parameters": [ + { + "name": "a", + "value": "First integration test value a", + }, + { + "default": "null", + "name": "a_art", + "value": "null", + }, + ] + }, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0(0)", + "node_flag": {"retried": True}, + "outputs": { + "artifacts": [ + { + "name": "a_art", + "path": "a_art", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" + }, + } + ], + "exit_code": "0", + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 43}, + "started_at": test_datetime, + "template_name": "convert-to-artifact", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "display_name": "show-artifact-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188", + "inputs": { + "artifacts": [ + { + "name": "a", + "path": "a", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" + }, + } + ], + "parameters": [ + {"default": "null", "name": "b", "value": "null"} + ], + }, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0(0)", + "node_flag": {"retried": True}, + "outputs": { + "artifacts": [ + { + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" + }, + } + ], + "exit_code": "0", + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 25}, + "started_at": test_datetime, + "template_name": "show-artifact", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" + ], + "display_name": "show-artifact-0", + "finished_at": test_datetime, + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-170779741", + "inputs": { + "artifacts": [ + { + "name": "a", + "path": "a", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" + }, + } + ], + "parameters": [ + {"default": "null", "name": "b", "value": "null"} + ], + }, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.show-artifact-0", + "outputs": { + "artifacts": [ + { + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" + }, + } + ], + "exit_code": "0", + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 25}, + "started_at": test_datetime, + "template_name": "show-artifact", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518" + ], + "display_name": "convert-to-artifact-0", + "finished_at": test_datetime, + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243", + "inputs": { + "parameters": [ + { + "name": "a", + "value": "First integration test value a", + }, + { + "default": "null", + "name": "a_art", + "value": "null", + }, + ] + }, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag.convert-to-artifact-0", + "outputs": { + "artifacts": [ + { + "name": "a_art", + "path": "a_art", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-convert-to-artifact-1074722518/a_art.tgz" + }, + } + ], + "exit_code": "0", + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 68}, + "started_at": test_datetime, + "template_name": "convert-to-artifact", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876": { + "boundary_id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d", + "children": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1834257243" + ], + "display_name": "bettmensch-ai-inner-dag", + "finished_at": test_datetime, + "id": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-4230836876", + "inputs": { + "parameters": [ + { + "name": "a", + "value": "First integration test value a", + } + ] + }, + "name": "pipeline-test-artifact-pipeline-jx7pb-flow-md47d.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188" + ], + "outputs": { + "artifacts": [ + { + "name": "b", + "path": "b", + "s3": { + "key": "argo-workflows/pipeline-test-artifact-pipeline-jx7pb-flow-md47d/pipeline-test-artifact-pipeline-jx7pb-flow-md47d-show-artifact-1613118188/b.tgz" + }, + } + ] + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 68}, + "started_at": test_datetime, + "template_name": "bettmensch-ai-inner-dag", + "template_scope": "local/", + "type": "DAG", + }, + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 68}, + "started_at": test_datetime, + "stored_templates": { + "namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-inner-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + } + ] + }, + "name": "convert-to-artifact-0", + "template": "convert-to-artifact", + }, + { + "arguments": { + "artifacts": [ + { + "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", + "name": "a", + } + ] + }, + "depends": "convert-to-artifact-0", + "name": "show-artifact-0", + "template": "show-artifact", + }, + ] + }, + "inputs": { + "parameters": [{"name": "a", "value": "Param A"}] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "artifacts": [ + { + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b", + } + ] + }, + }, + "namespaced/pipeline-test-artifact-pipeline-jx7pb/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + "namespaced/pipeline-test-artifact-pipeline-jx7pb/convert-to-artifact": { + "inputs": { + "parameters": [ + {"name": "a"}, + {"default": "null", "name": "a_art"}, + ] + }, + "metadata": {}, + "name": "convert-to-artifact", + "outputs": { + "artifacts": [{"name": "a_art", "path": "a_art"}] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", + }, + }, + "namespaced/pipeline-test-artifact-pipeline-jx7pb/show-artifact": { + "inputs": { + "artifacts": [{"name": "a", "path": "a"}], + "parameters": [{"default": "null", "name": "b"}], + }, + "metadata": {}, + "name": "show-artifact", + "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact("a")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact("b")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\n with open(b.path, \'w\') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n', + }, + }, + }, + "stored_workflow_template_spec": { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "First integration test value a", + } + ] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "service_account_name": "argo-workflow", + "templates": [ + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + } + ] + }, + "name": "convert-to-artifact-0", + "template": "convert-to-artifact", + }, + { + "arguments": { + "artifacts": [ + { + "from_": "{{tasks.convert-to-artifact-0.outputs.artifacts.a_art}}", + "name": "a", + } + ] + }, + "depends": "convert-to-artifact-0", + "name": "show-artifact-0", + "template": "show-artifact", + }, + ] + }, + "inputs": { + "parameters": [{"name": "a", "value": "Param A"}] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "artifacts": [ + { + "from_": "{{tasks.show-artifact-0.outputs.artifacts.b}}", + "name": "b", + } + ] + }, + }, + { + "inputs": { + "parameters": [ + {"name": "a"}, + {"default": "null", "name": "a_art"}, + ] + }, + "metadata": {}, + "name": "convert-to-artifact", + "outputs": { + "artifacts": [{"name": "a_art", "path": "a_art"}] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\na_art = OutputArtifact(\"a_art\")\n\ndef convert_to_artifact(a: InputParameter, a_art: OutputArtifact=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that converts its InputParameter into\n an OutputArtifact.\"\"\"\n with open(a_art.path, 'w') as a_art_file:\n a_art_file.write(str(a))\n\nconvert_to_artifact(a,a_art)\n", + }, + }, + { + "inputs": { + "artifacts": [{"name": "a", "path": "a"}], + "parameters": [{"default": "null", "name": "b"}], + }, + "metadata": {}, + "name": "show-artifact", + "outputs": {"artifacts": [{"name": "b", "path": "b"}]}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": 'import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import InputArtifact\na = InputArtifact("a")\n\nfrom bettmensch_ai.pipelines.io import OutputArtifact\nb = OutputArtifact("b")\n\ndef show_artifact(a: InputArtifact, b: OutputArtifact=None) -> None:\n """When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputArtifact."""\n with open(a.path, \'r\') as a_art_file:\n a_content = a_art_file.read()\n print(f\'Content of input artifact a: {a_content}\')\n with open(b.path, \'w\') as b_art_file:\n b_art_file.write(str(a_content))\n\nshow_artifact(a,b)\n', + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + } + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + "workflow_template_ref": { + "name": "pipeline-test-artifact-pipeline-jx7pb" + }, + }, + "task_results_completion_status": { + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1074722518": True, + "pipeline-test-artifact-pipeline-jx7pb-flow-md47d-1613118188": True, + }, + }, + ) + + +@pytest.fixture +def test_hera_parameter_workflow_model(test_datetime): + + return WorkflowModel( + metadata={ + "annotations": { + "karpenter.sh/do-not-disrupt": "true", + "workflows.argoproj.io/pod-name-format": "v2", + }, + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-parameter-pipeline-c877j-flow-", + "generation": 7, + "labels": { + "bettmensch.ai/pipeline-id": "d2715290-865d-4776-84c4-776632cd7159", + "bettmensch.ai/pipeline-name": "pipeline-test-parameter-pipeline-c877j", + "workflows.argoproj.io/completed": "true", + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", + "workflows.argoproj.io/phase": "Succeeded", + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + }, + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "workflow-controller", + "operation": "Update", + "time": test_datetime, + }, + ], + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "namespace": "argo", + "resource_version": "8018", + "uid": "f4623367-e5c2-4ba7-9a7a-633c55314421", + }, + spec={ + "arguments": { + "parameters": [ + {"name": "a", "value": "-100"}, + {"name": "b", "value": "100"}, + ] + }, + "workflow_template_ref": { + "name": "pipeline-test-parameter-pipeline-c877j" + }, + }, + status={ + "artifact_gc_status": {"not_specified": True}, + "artifact_repository_ref": { + "artifact_repository": { + "s3": { + "bucket": "bettmensch-ai-artifact-repository", + "endpoint": "s3.us-east-2.amazonaws.com", + "insecure": True, + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}", + } + }, + "config_map": "artifact-repositories", + "key": "bettmensch-ai-artifact-repository", + "namespace": "argo", + }, + "conditions": [ + {"status": "False", "type": "PodRunning"}, + {"status": "True", "type": "Completed"}, + ], + "finished_at": test_datetime, + "nodes": { + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn": { + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891" + ], + "display_name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "finished_at": test_datetime, + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "outbound_nodes": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" + ], + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 54}, + "started_at": test_datetime, + "template_name": "bettmensch-ai-outer-dag", + "template_scope": "local/", + "type": "DAG", + }, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn", + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323" + ], + "display_name": "bettmensch-ai-inner-dag", + "finished_at": test_datetime, + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "inputs": { + "parameters": [ + {"name": "a", "value": "-100"}, + {"name": "b", "value": "100"}, + ] + }, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" + ], + "outputs": {"parameters": [{"name": "sum", "value": "2"}]}, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 54}, + "started_at": test_datetime, + "template_name": "bettmensch-ai-inner-dag", + "template_scope": "local/", + "type": "DAG", + }, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911" + ], + "display_name": "a-plus-b-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278", + "inputs": { + "parameters": [ + {"default": "1", "name": "a", "value": "-100"}, + {"default": "2", "name": "b", "value": "100"}, + { + "default": "null", + "name": "sum", + "value": "null", + }, + ] + }, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0(0)", + "node_flag": {"retried": True}, + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "sum", + "value": "0", + "value_from": {"path": "sum"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 28}, + "started_at": test_datetime, + "template_name": "a-plus-b", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770" + ], + "display_name": "a-plus-b-plus-2-0", + "finished_at": test_datetime, + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1697420911", + "inputs": { + "parameters": [ + {"default": "1", "name": "a", "value": "0"}, + {"default": "2", "name": "b", "value": "2"}, + { + "default": "null", + "name": "sum", + "value": "null", + }, + ] + }, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0", + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "sum", + "value": "2", + "value_from": {"path": "sum"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 26}, + "started_at": test_datetime, + "template_name": "a-plus-b-plus-2", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "children": [ + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278" + ], + "display_name": "a-plus-b-0", + "finished_at": test_datetime, + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-3695553323", + "inputs": { + "parameters": [ + {"default": "1", "name": "a", "value": "-100"}, + {"default": "2", "name": "b", "value": "100"}, + { + "default": "null", + "name": "sum", + "value": "null", + }, + ] + }, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-0", + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "sum", + "value": "0", + "value_from": {"path": "sum"}, + } + ], + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 54}, + "started_at": test_datetime, + "template_name": "a-plus-b", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": { + "boundary_id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1140354891", + "display_name": "a-plus-b-plus-2-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770", + "inputs": { + "parameters": [ + {"default": "1", "name": "a", "value": "0"}, + {"default": "2", "name": "b", "value": "2"}, + { + "default": "null", + "name": "sum", + "value": "null", + }, + ] + }, + "name": "pipeline-test-parameter-pipeline-c877j-flow-tfgmn.bettmensch-ai-inner-dag.a-plus-b-plus-2-0(0)", + "node_flag": {"retried": True}, + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "sum", + "value": "2", + "value_from": {"path": "sum"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 26}, + "started_at": test_datetime, + "template_name": "a-plus-b-plus-2", + "template_scope": "local/", + "type": "Pod", + }, + }, + "phase": "Succeeded", + "progress": "2/2", + "resources_duration": {"cpu": 2, "memory": 54}, + "started_at": test_datetime, + "stored_templates": { + "namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b": { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + "namespaced/pipeline-test-parameter-pipeline-c877j/a-plus-b-plus-2": { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b-plus-2", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-inner-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + }, + { + "name": "b", + "value": "{{inputs.parameters.b}}", + }, + ] + }, + "name": "a-plus-b-0", + "template": "a-plus-b", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", + }, + {"name": "b", "value": "2"}, + ] + }, + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "a", "value": "1"}, + {"name": "b", "value": "2"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "parameters": [ + { + "name": "sum", + "value_from": { + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" + }, + } + ] + }, + }, + "namespaced/pipeline-test-parameter-pipeline-c877j/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + }, + "stored_workflow_template_spec": { + "arguments": { + "parameters": [ + {"name": "a", "value": "-100"}, + {"name": "b", "value": "100"}, + ] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "service_account_name": "argo-workflow", + "templates": [ + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{inputs.parameters.a}}", + }, + { + "name": "b", + "value": "{{inputs.parameters.b}}", + }, + ] + }, + "name": "a-plus-b-0", + "template": "a-plus-b", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.a-plus-b-0.outputs.parameters.sum}}", + }, + {"name": "b", "value": "2"}, + ] + }, + "depends": "a-plus-b-0", + "name": "a-plus-b-plus-2-0", + "template": "a-plus-b-plus-2", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "a", "value": "1"}, + {"name": "b", "value": "2"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": { + "parameters": [ + { + "name": "sum", + "value_from": { + "parameter": "{{tasks.a-plus-b-plus-2-0.outputs.parameters.sum}}" + }, + } + ] + }, + }, + { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + { + "inputs": { + "parameters": [ + {"default": "1", "name": "a"}, + {"default": "2", "name": "b"}, + {"default": "null", "name": "sum"}, + ] + }, + "metadata": {}, + "name": "a-plus-b-plus-2", + "outputs": { + "parameters": [ + {"name": "sum", "value_from": {"path": "sum"}} + ] + }, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\ntry: b = json.loads(r'''{{inputs.parameters.b}}''')\nexcept: b = r'''{{inputs.parameters.b}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nsum = OutputParameter(\"sum\")\n\ndef add_parameters(a: InputParameter=1, b: InputParameter=2, sum: OutputParameter=None) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a simple addition bettmensch_ai.Component.\"\"\"\n sum.assign(a + b)\n\nadd_parameters(a,b,sum)\n", + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{workflow.parameters.a}}", + }, + { + "name": "b", + "value": "{{workflow.parameters.b}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + "workflow_template_ref": { + "name": "pipeline-test-parameter-pipeline-c877j" + }, + }, + "task_results_completion_status": { + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-1412890278": True, + "pipeline-test-parameter-pipeline-c877j-flow-tfgmn-4267990770": True, + }, + }, + ) + + +@pytest.fixture +def test_hera_torch_gpu_workflow_model(test_datetime): + + return WorkflowModel( + metadata={ + "annotations": { + "karpenter.sh/do-not-disrupt": "true", + "workflows.argoproj.io/pod-name-format": "v2", + }, + "creation_timestamp": test_datetime, + "generate_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-", + "generation": 13, + "labels": { + "bettmensch.ai/pipeline-id": "612226a1-b40f-4f68-92c3-ea8a5d6b3995", + "bettmensch.ai/pipeline-name": "pipeline-test-torch-gpu-pipeline-7c4zp", + "workflows.argoproj.io/completed": "true", + "workflows.argoproj.io/creator": "system-serviceaccount-argo-argo-server", + "workflows.argoproj.io/phase": "Succeeded", + }, + "managed_fields": [ + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "argo", + "operation": "Update", + "time": test_datetime, + }, + { + "api_version": "argoproj.io/v1alpha1", + "fields_type": "FieldsV1", + "fields_v1": {}, + "manager": "workflow-controller", + "operation": "Update", + "time": test_datetime, + }, + ], + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "namespace": "argo", + "resource_version": "11463", + "uid": "ae69b1e3-a235-44d5-8667-bef63fc15821", + }, + spec={ + "arguments": { + "parameters": [ + {"name": "n_iter", "value": "15"}, + {"name": "n_seconds_sleep", "value": "2"}, + ] + }, + "workflow_template_ref": { + "name": "pipeline-test-torch-gpu-pipeline-7c4zp" + }, + }, + status={ + "artifact_gc_status": {"not_specified": True}, + "artifact_repository_ref": { + "artifact_repository": { + "s3": { + "bucket": "bettmensch-ai-artifact-repository", + "endpoint": "s3.us-east-2.amazonaws.com", + "insecure": True, + "key_format": "argo-workflows/{{workflow.name}}/{{pod.name}}", + } + }, + "config_map": "artifact-repositories", + "key": "bettmensch-ai-artifact-repository", + "namespace": "argo", + }, + "conditions": [ + {"status": "False", "type": "PodRunning"}, + {"status": "True", "type": "Completed"}, + ], + "finished_at": test_datetime, + "nodes": { + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf": { + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060" + ], + "display_name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "finished_at": test_datetime, + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "outbound_nodes": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", + ], + "phase": "Succeeded", + "progress": "5/5", + "resources_duration": { + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190, + }, + "started_at": test_datetime, + "template_name": "bettmensch-ai-outer-dag", + "template_scope": "local/", + "type": "DAG", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "torch-ddp-delete-torch-ddp-service", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-48-85.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-delete-torch-ddp-service", + "outputs": {"exit_code": "0"}, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 0, "memory": 0}, + "started_at": test_datetime, + "template_name": "torch-ddp-delete-torch-ddp-service", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", + ], + "display_name": "torch-ddp-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-50-210.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15", + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2", + }, + { + "default": "null", + "name": "duration", + "value": "null", + }, + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0(0)", + "node_flag": {"retried": True}, + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": {"path": "duration"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 11, + "memory": 839, + "nvidia.com/gpu": 99, + }, + "started_at": test_datetime, + "template_name": "torch-ddp-0", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059", + ], + "display_name": "torch-ddp-create-torch-ddp-service", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252", + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-create-torch-ddp-service", + "outputs": {"exit_code": "0"}, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 0, "memory": 1}, + "started_at": test_datetime, + "template_name": "torch-ddp-create-torch-ddp-service", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694" + ], + "display_name": "show-duration-param-0", + "finished_at": test_datetime, + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2733896051", + "inputs": {"parameters": [{"name": "a", "value": "30"}]}, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0", + "outputs": {"exit_code": "0"}, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 27}, + "started_at": test_datetime, + "template_name": "show-duration-param", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430" + ], + "display_name": "torch-ddp-0-worker-1", + "finished_at": test_datetime, + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-4097461059", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15", + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2", + }, + { + "default": "null", + "name": "duration", + "value": "null", + }, + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1", + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": {"path": "duration"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 11, + "memory": 777, + "nvidia.com/gpu": 91, + }, + "started_at": test_datetime, + "template_name": "torch-ddp-1", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252" + ], + "display_name": "bettmensch-ai-inner-dag", + "finished_at": test_datetime, + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "inputs": { + "parameters": [ + {"name": "n_iter", "value": "15"}, + {"name": "n_seconds_sleep", "value": "2"}, + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag", + "outbound_nodes": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231", + ], + "phase": "Succeeded", + "progress": "5/5", + "resources_duration": { + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190, + }, + "started_at": test_datetime, + "template_name": "bettmensch-ai-inner-dag", + "template_scope": "local/", + "type": "DAG", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "torch-ddp-0-worker-1(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-50-218.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15", + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2", + }, + { + "default": "null", + "name": "duration", + "value": "null", + }, + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0-worker-1(0)", + "node_flag": {"retried": True}, + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": {"path": "duration"}, + } + ], + }, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": { + "cpu": 11, + "memory": 777, + "nvidia.com/gpu": 91, + }, + "started_at": test_datetime, + "template_name": "torch-ddp-1", + "template_scope": "local/", + "type": "Pod", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "children": [ + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387" + ], + "display_name": "torch-ddp-0", + "finished_at": test_datetime, + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-47634872", + "inputs": { + "parameters": [ + { + "default": "100", + "name": "n_iter", + "value": "15", + }, + { + "default": "10", + "name": "n_seconds_sleep", + "value": "2", + }, + { + "default": "null", + "name": "duration", + "value": "null", + }, + ] + }, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.torch-ddp-0", + "outputs": { + "exit_code": "0", + "parameters": [ + { + "name": "duration", + "value": "30", + "value_from": {"path": "duration"}, + } + ], + }, + "phase": "Succeeded", + "progress": "3/3", + "resources_duration": { + "cpu": 12, + "memory": 866, + "nvidia.com/gpu": 99, + }, + "started_at": test_datetime, + "template_name": "torch-ddp-0", + "template_scope": "local/", + "type": "Retry", + }, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": { + "boundary_id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-414716060", + "display_name": "show-duration-param-0(0)", + "finished_at": test_datetime, + "host_node_name": "ip-10-0-49-235.us-east-2.compute.internal", + "id": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694", + "inputs": {"parameters": [{"name": "a", "value": "30"}]}, + "name": "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf.bettmensch-ai-inner-dag.show-duration-param-0(0)", + "node_flag": {"retried": True}, + "outputs": {"exit_code": "0"}, + "phase": "Succeeded", + "progress": "1/1", + "resources_duration": {"cpu": 1, "memory": 27}, + "started_at": test_datetime, + "template_name": "show-duration-param", + "template_scope": "local/", + "type": "Pod", + }, + }, + "phase": "Succeeded", + "progress": "5/5", + "resources_duration": { + "cpu": 23, + "memory": 1644, + "nvidia.com/gpu": 190, + }, + "started_at": test_datetime, + "stored_templates": { + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-inner-dag": { + "dag": { + "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1", + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", + } + ] + }, + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "n_iter"}, + {"name": "n_seconds_sleep"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {}, + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/bettmensch-ai-outer-dag": { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/show-duration-param": { + "inputs": {"parameters": [{"name": "a"}]}, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", + }, + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-0": { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0", + } + }, + "name": "torch-ddp-0", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP", + } + ], + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", # noqa: E501 + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-1": { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1", + } + }, + "name": "torch-ddp-1", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-create-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n", + }, + }, + "namespaced/pipeline-test-torch-gpu-pipeline-7c4zp/torch-ddp-delete-torch-ddp-service": { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo", + ], + }, + }, + }, + "stored_workflow_template_spec": { + "arguments": { + "parameters": [ + {"name": "n_iter", "value": "15"}, + {"name": "n_seconds_sleep", "value": "2"}, + ] + }, + "entrypoint": "bettmensch-ai-outer-dag", + "service_account_name": "argo-workflow", + "templates": [ + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-create-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "create", + "manifest": "apiVersion: v1\nkind: Service\nmetadata:\n name: torch-ddp-0-{{workflow.uid}}\n namespace: argo\n labels:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\nspec:\n clusterIP: None # ClusterIP set to None for headless service.\n ports:\n - name: ddp # Port for torchrun master<->worker node coms.\n port: 29200\n targetPort: 29200\n selector:\n workflows.argoproj.io/workflow: {{workflow.name}}\n torch-job: torch-ddp-0\n torch-node: '0' # Selector for pods associated with this service.\n", + }, + }, + { + "inputs": {}, + "metadata": {}, + "name": "torch-ddp-delete-torch-ddp-service", + "outputs": {}, + "resource": { + "action": "delete", + "flags": [ + "service", + "--selector", + "torch-job=torch-ddp-0,workflows.argoproj.io/workflow={{workflow.name}}", + "-n", + "argo", + ], + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": {}, + "name": "torch-ddp-create-torch-ddp-service", + "template": "torch-ddp-create-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0", + "template": "torch-ddp-0", + }, + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{inputs.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{inputs.parameters.n_seconds_sleep}}", + }, + ] + }, + "depends": "torch-ddp-create-torch-ddp-service", + "name": "torch-ddp-0-worker-1", + "template": "torch-ddp-1", + }, + { + "arguments": {}, + "depends": "torch-ddp-0", + "name": "torch-ddp-delete-torch-ddp-service", + "template": "torch-ddp-delete-torch-ddp-service", + }, + { + "arguments": { + "parameters": [ + { + "name": "a", + "value": "{{tasks.torch-ddp-0.outputs.parameters.duration}}", + } + ] + }, + "depends": "torch-ddp-0", + "name": "show-duration-param-0", + "template": "show-duration-param", + }, + ] + }, + "inputs": { + "parameters": [ + {"name": "n_iter"}, + {"name": "n_seconds_sleep"}, + ] + }, + "metadata": {}, + "name": "bettmensch-ai-inner-dag", + "outputs": {}, + }, + { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "0", + } + }, + "name": "torch-ddp-0", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "0", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "ports": [ + { + "container_port": 29200, + "name": "ddp", + "protocol": "TCP", + } + ], + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + { + "inputs": { + "parameters": [ + {"default": "100", "name": "n_iter"}, + {"default": "10", "name": "n_seconds_sleep"}, + {"default": "null", "name": "duration"}, + ] + }, + "metadata": { + "labels": { + "torch-job": "torch-ddp-0", + "torch-node": "1", + } + }, + "name": "torch-ddp-1", + "outputs": { + "parameters": [ + { + "name": "duration", + "value_from": {"path": "duration"}, + } + ] + }, + "pod_spec_patch": "topologySpreadConstraints:\n- maxSkew: 1\n topologyKey: kubernetes.io/hostname\n whenUnsatisfiable: DoNotSchedule\n labelSelector:\n matchExpressions:\n - { key: torch-node, operator: In, values: ['0','1','2','3','4','5']}", + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "env": [ + {"name": "NCCL_DEBUG", "value": "INFO"}, + { + "name": "bettmensch_ai_torch_ddp_min_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_max_nodes", + "value": "2", + }, + { + "name": "bettmensch_ai_torch_ddp_node_rank", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_nproc_per_node", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_max_restarts", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_start_method", + "value": "fork", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_backend", + "value": "static", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_url", + "value": "torch-ddp-0-{{workflow.uid}}.argo.svc.cluster.local", + }, + { + "name": "bettmensch_ai_torch_ddp_rdzv_endpoint_port", + "value": "29200", + }, + { + "name": "bettmensch_ai_torch_ddp_run_id", + "value": "1", + }, + { + "name": "bettmensch_ai_torch_ddp_tee", + "value": "0", + }, + ], + "image": "bettmensch88/bettmensch.ai-pytorch:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + "requests": { + "cpu": "100m", + "memory": "700Mi", + "nvidia.com/gpu": "1", + }, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: n_iter = json.loads(r'''{{inputs.parameters.n_iter}}''')\nexcept: n_iter = r'''{{inputs.parameters.n_iter}}'''\ntry: n_seconds_sleep = json.loads(r'''{{inputs.parameters.n_seconds_sleep}}''')\nexcept: n_seconds_sleep = r'''{{inputs.parameters.n_seconds_sleep}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\nfrom bettmensch_ai.pipelines.io import OutputParameter\nduration = OutputParameter(\"duration\")\n\ndef tensor_reduce(n_iter: InputParameter=100, n_seconds_sleep: InputParameter=10, duration: OutputParameter=None) -> None:\n \"\"\"When decorated with the torch_component decorator, implements a\n bettmensch_ai.TorchComponent that runs a torch DDP across pods and nodes in\n your K8s cluster.\"\"\"\n import time\n from datetime import datetime as dt\n import GPUtil\n import torch\n import torch.distributed as dist\n from bettmensch_ai.pipelines.component.torch_ddp import LaunchContext\n has_gpu = torch.cuda.is_available()\n ddp_context = LaunchContext()\n print(f'GPU present: {has_gpu}')\n if has_gpu:\n dist.init_process_group(backend='nccl')\n else:\n dist.init_process_group(backend='gloo')\n for i in range(1, n_iter + 1):\n time.sleep(n_seconds_sleep)\n GPUtil.showUtilization()\n a = torch.tensor([ddp_context.rank])\n print(f'{i}/{n_iter}: @{dt.now()}')\n print(f'{i}/{n_iter}: Backend {dist.get_backend()}')\n print(f'{i}/{n_iter}: Global world size: {ddp_context.world_size}')\n print(f'{i}/{n_iter}: Global worker process rank: {ddp_context.rank}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.rank + 1}/{ddp_context.world_size} globally!')\n print(f'{i}/{n_iter}: Local rank of worker: {ddp_context.local_rank}')\n print(f'{i}/{n_iter}: Local world size: {ddp_context.local_world_size}')\n print(f'{i}/{n_iter}: This makes me worker process {ddp_context.local_rank + 1}/{ddp_context.local_world_size} locally!')\n print(f'{i}/{n_iter}: Node/pod rank: {ddp_context.group_rank}')\n if has_gpu:\n device = torch.device(f'cuda:{ddp_context.local_rank}')\n device_count = torch.cuda.device_count()\n print(f'{i}/{n_iter}: GPU count: {device_count}')\n device_name = torch.cuda.get_device_name(ddp_context.local_rank)\n print(f'{i}/{n_iter}: GPU name: {device_name}')\n device_property = torch.cuda.get_device_capability(device)\n print(f'{i}/{n_iter}: GPU property: {device_property}')\n else:\n device = torch.device('cpu')\n a_placed = a.to(device)\n print(f'{i}/{n_iter}: Pre-`all_reduce` tensor: {a_placed}')\n dist.all_reduce(a_placed)\n print(f'{i}/{n_iter}: Post-`all_reduce` tensor: {a_placed}')\n print('===================================================')\n if duration is not None:\n duration_seconds = n_iter * n_seconds_sleep\n duration.assign(duration_seconds)\n\nfrom torch.distributed.elastic.multiprocessing.errors import record\n\ntensor_reduce=record(tensor_reduce)\n\nfrom bettmensch_ai.pipelines.component import as_torch_ddp\n\ntorch_ddp_decorator=as_torch_ddp()\n\ntorch_ddp_function=torch_ddp_decorator(tensor_reduce)\n\n\ntorch_ddp_function(n_iter,n_seconds_sleep,duration)", + }, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "nvidia.com/gpu", + "operator": "Exists", + } + ], + }, + { + "inputs": {"parameters": [{"name": "a"}]}, + "metadata": {}, + "name": "show-duration-param", + "outputs": {}, + "retry_strategy": { + "limit": "1", + "retry_policy": "OnError", + }, + "script": { + "command": ["python"], + "image": "bettmensch88/bettmensch.ai-standard:3.11-latest", + "image_pull_policy": "Always", + "name": "", + "resources": { + "limits": {"cpu": "100m", "memory": "100Mi"}, + "requests": {"cpu": "100m", "memory": "100Mi"}, + }, + "source": "import os\nimport sys\nsys.path.append(os.getcwd())\n\n# --- preprocessing\nimport json\ntry: a = json.loads(r'''{{inputs.parameters.a}}''')\nexcept: a = r'''{{inputs.parameters.a}}'''\n\nfrom bettmensch_ai.pipelines.io import InputParameter\n\ndef show_parameter(a: InputParameter) -> None:\n \"\"\"When decorated with the bettmensch_ai.components.component decorator,\n implements a bettmensch_ai.Component that prints the values of its\n InputParameter.\"\"\"\n print(f'Content of input parameter a is: {a}')\n\nshow_parameter(a)\n", + }, + }, + { + "dag": { + "tasks": [ + { + "arguments": { + "parameters": [ + { + "name": "n_iter", + "value": "{{workflow.parameters.n_iter}}", + }, + { + "name": "n_seconds_sleep", + "value": "{{workflow.parameters.n_seconds_sleep}}", + }, + ] + }, + "name": "bettmensch-ai-inner-dag", + "template": "bettmensch-ai-inner-dag", + } + ] + }, + "inputs": {}, + "metadata": {}, + "name": "bettmensch-ai-outer-dag", + "outputs": {}, + }, + ], + "workflow_template_ref": { + "name": "pipeline-test-torch-gpu-pipeline-7c4zp" + }, + }, + "task_results_completion_status": { + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1368447231": True, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-1861925387": True, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-2020597252": True, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-41628430": True, + "pipeline-test-torch-gpu-pipeline-7c4zp-flow-9ldcf-947069694": True, + }, + }, + ) diff --git a/test_api_models.py b/test_api_models.py index db377a9..1ce929c 100644 --- a/test_api_models.py +++ b/test_api_models.py @@ -24,6 +24,30 @@ os.makedirs(DIR) +def recursive_non_null_dict(data): + + if isinstance(data, dict): + non_null_data = {} + + for k, v in data.items(): + if v is None: + print(f"Key {k} | None-Value {v}") + pass + elif isinstance(v, (dict, list, tuple)): + non_null_data[k] = recursive_non_null_dict(v) + else: + print(f"Key {k} | Value {v}") + non_null_data[k] = v + + return non_null_data + elif isinstance(data, list): + return [recursive_non_null_dict(data_i) for data_i in data] + elif isinstance(data, tuple): + return (recursive_non_null_dict(data_i) for data_i in data) + else: + return data + + def recursive_datetime_to_string(data): if isinstance(data, dict): return dict( @@ -34,7 +58,7 @@ def recursive_datetime_to_string(data): elif isinstance(data, tuple): return (recursive_datetime_to_string(data_i) for data_i in data) elif isinstance(data, datetime.datetime): - return "test-datetime-value" + return "07/12/2024" else: return data @@ -61,8 +85,8 @@ def recursive_datetime_to_string(data): for i, argo_workflow_template in enumerate( argo_workflow_template_list_response.items ): - argo_workflow_template_dict = recursive_datetime_to_string( - argo_workflow_template.to_dict() + argo_workflow_template_dict = recursive_non_null_dict( + recursive_datetime_to_string(argo_workflow_template.to_dict()) ) with open( f"{ARGO_WORKFLOW_TEMPLATE_MODELS_DIR}/argo_workflow_template_{i}.json", @@ -72,7 +96,9 @@ def recursive_datetime_to_string(data): # export argo workflow for i, argo_workflow in enumerate(argo_workflow_list_response.items): - argo_workflow_dict = recursive_datetime_to_string(argo_workflow.to_dict()) + argo_workflow_dict = recursive_non_null_dict( + recursive_datetime_to_string(argo_workflow.to_dict()) + ) with open( f"{ARGO_WORKFLOW_MODELS_DIR}/argo_workflow_{i}.json", "w" ) as argo_workflow_file: @@ -92,8 +118,8 @@ def recursive_datetime_to_string(data): for i, hera_workflow_template in enumerate( hera_workflow_template_list_response.items ): - hera_workflow_template_dict = recursive_datetime_to_string( - hera_workflow_template.dict() + hera_workflow_template_dict = recursive_non_null_dict( + recursive_datetime_to_string(hera_workflow_template.dict()) ) with open( f"{HERA_WORKFLOW_TEMPLATE_MODELS_DIR}/hera_workflow_template_{i}.json", @@ -103,7 +129,9 @@ def recursive_datetime_to_string(data): # export hera workflow for i, hera_workflow in enumerate(hera_workflow_list_response.items): - hera_workflow_dict = recursive_datetime_to_string(hera_workflow.dict()) + hera_workflow_dict = recursive_non_null_dict( + recursive_datetime_to_string(hera_workflow.dict()) + ) with open( f"{HERA_WORKFLOW_MODELS_DIR}/hera_workflow_{i}.json", "w" ) as hera_workflow_file: