Skip to content

Commit

Permalink
Fix parametric instability at container start (#3359)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbeauchesne authored Nov 4, 2024
1 parent 41807c2 commit ad46fba
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 133 deletions.
6 changes: 0 additions & 6 deletions docs/scenarios/parametric.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,6 @@ library. Deleting the image will force a rebuild which will resolve the issue.
docker image rm <library>-test-library
```

### Port conflict on 50052

If there is a port conflict with an existing process on the local machine then the default port `50052` can be
overridden using `APM_LIBRARY_SERVER_PORT`.


### Disable build kit

If logs like
Expand Down
101 changes: 29 additions & 72 deletions tests/parametric/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Dict, Generator, List, TextIO, TypedDict
import urllib.parse

from docker.models.containers import Container
import requests
import pytest

Expand Down Expand Up @@ -387,44 +386,6 @@ def wait_for_tracer_flare(self, case_id: str = None, clear: bool = False, wait_l
raise AssertionError("No tracer-flare received")


@contextlib.contextmanager
def docker_run(
    image: str,
    name: str,
    cmd: List[str],
    env: Dict[str, str],
    volumes: Dict[str, str],
    host_port: int,
    container_port: int,
    log_file: TextIO,
    network_name: str,
) -> Generator[Container, None, None]:
    """Start a docker container via the parametric scenario helper and yield it.

    On exit the container is stopped (1s grace), its logs are flushed to
    ``log_file``, and the container is force-removed.
    """
    logger.info(f"Starting {name}")

    ctr = scenarios.parametric.docker_run(
        image,
        name=name,
        command=cmd,
        env=env,
        volumes=volumes,
        network=network_name,
        host_port=host_port,
        container_port=container_port,
    )

    try:
        yield ctr
    finally:
        # Tear down regardless of how the caller's block exited.
        logger.info(f"Stopping {name}")
        ctr.stop(timeout=1)
        log_file.write(ctr.logs().decode("utf-8"))
        log_file.flush()
        ctr.remove(force=True)


@pytest.fixture(scope="session")
def docker() -> str:
"""Fixture to ensure docker is ready to use on the system."""
Expand Down Expand Up @@ -515,28 +476,27 @@ def test_agent(
# (trace_content_length) go client doesn't submit content length header
env["ENABLED_CHECKS"] = "trace_count_header"

host_port = scenarios.parametric.get_host_port(worker_id, 50000)
host_port = scenarios.parametric.get_host_port(worker_id, 4600)

with docker_run(
with scenarios.parametric.docker_run(
image=scenarios.parametric.TEST_AGENT_IMAGE,
name=test_agent_container_name,
cmd=[],
command=[],
env=env,
volumes={f"{os.getcwd()}/snapshots": "/snapshots"},
host_port=host_port,
container_port=test_agent_port,
log_file=test_agent_log_file,
network_name=docker_network,
network=docker_network,
):
logger.debug(f"Test agent {test_agent_container_name} started on host port {host_port}")

client = _TestAgentAPI(base_url=f"http://localhost:{host_port}", pytest_request=request)
time.sleep(0.2) # intial wait time, the trace agent takes 200ms to start
for _ in range(100):
try:
resp = client.info()
except:
logger.debug("Wait for 0.1s for the test agent to be ready")
except Exception as e:
logger.debug(f"Wait for 0.1s for the test agent to be ready {e}")
time.sleep(0.1)
else:
if resp["version"] != "test":
Expand All @@ -547,8 +507,7 @@ def test_agent(
logger.info("Test agent is ready")
break
else:
with open(test_agent_log_file.name) as f:
logger.error(f"Could not connect to test agent: {f.read()}")
logger.error("Could not connect to test agent")
pytest.fail(
f"Could not connect to test agent, check the log file {test_agent_log_file.name}.", pytrace=False
)
Expand All @@ -568,14 +527,15 @@ def test_agent(


@pytest.fixture
def test_server(
def test_library(
worker_id: str,
docker_network: str,
test_agent_port: str,
test_agent_container_name: str,
apm_test_server: APMLibraryTestServer,
test_server_log_file: TextIO,
):
) -> Generator[APMLibrary, None, None]:

env = {
"DD_TRACE_DEBUG": "true",
"DD_TRACE_AGENT_URL": f"http://{test_agent_container_name}:{test_agent_port}",
Expand All @@ -590,38 +550,35 @@ def test_server(
test_server_env[k] = v
env.update(test_server_env)

apm_test_server.host_port = scenarios.parametric.get_host_port(worker_id, 51000)
apm_test_server.host_port = scenarios.parametric.get_host_port(worker_id, 4500)

with docker_run(
with scenarios.parametric.docker_run(
image=apm_test_server.container_tag,
name=apm_test_server.container_name,
cmd=apm_test_server.container_cmd,
command=apm_test_server.container_cmd,
env=env,
host_port=apm_test_server.host_port,
container_port=apm_test_server.container_port,
volumes=apm_test_server.volumes,
log_file=test_server_log_file,
network_name=docker_network,
network=docker_network,
) as container:
logger.debug(f"Test server {apm_test_server.container_name} started on host port {apm_test_server.host_port}")
apm_test_server.container = container
yield apm_test_server

test_server_timeout = 60

@pytest.fixture
def test_library(test_server: APMLibraryTestServer) -> Generator[APMLibrary, None, None]:
test_server_timeout = 60

if test_server.host_port is None:
raise RuntimeError("Internal error, no port has been assigned", 1)
if apm_test_server.host_port is None:
raise RuntimeError("Internal error, no port has been assigned", 1)

if test_server.protocol == "grpc":
client = APMLibraryClientGRPC(f"localhost:{test_server.host_port}", test_server_timeout, test_server.container)
elif test_server.protocol == "http":
client = APMLibraryClientHTTP(
f"http://localhost:{test_server.host_port}", test_server_timeout, test_server.container
)
else:
raise ValueError(f"Interface {test_server.protocol} not supported")
tracer = APMLibrary(client, test_server.lang)
yield tracer
if apm_test_server.protocol == "grpc":
client = APMLibraryClientGRPC(
f"localhost:{apm_test_server.host_port}", test_server_timeout, apm_test_server.container
)
elif apm_test_server.protocol == "http":
client = APMLibraryClientHTTP(
f"http://localhost:{apm_test_server.host_port}", test_server_timeout, apm_test_server.container
)
else:
raise ValueError(f"Interface {apm_test_server.protocol} not supported")
tracer = APMLibrary(client, apm_test_server.lang)
yield tracer
1 change: 1 addition & 0 deletions tests/parametric/test_dynamic_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,7 @@ def test_trace_sampling_rules_with_tags(self, test_agent, test_library):
@bug(library="cpp", reason="unknown")
@bug(library="ruby", reason="To be investigated")
@bug(context.library > "[email protected]", reason="APMAPI-833")
@bug(library="python", reason="APMAPI-857")
@parametrize("library_env", [{**DEFAULT_ENVVARS}])
def test_remote_sampling_rules_retention(self, library_env, test_agent, test_library):
"""Only the last set of sampling rules should be applied"""
Expand Down
14 changes: 7 additions & 7 deletions tests/parametric/test_library_tracestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_metrics_msgpack_serialization_TS001(self, library_env, test_agent, test
@missing_feature(context.library == "nodejs", reason="nodejs has not implemented stats computation yet")
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_library, test_server):
def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_library):
"""
When spans are created with a unique set of dimensions
Each span has stats computed for it and is in its own bucket
Expand Down Expand Up @@ -160,7 +160,7 @@ def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_libr
) as span:
span.set_meta(key="http.status_code", val="400")

if test_server.lang == "golang":
if test_library.lang == "golang":
test_library.flush()

requests = test_agent.v06_stats_requests()
Expand All @@ -185,7 +185,7 @@ def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_libr
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_measured_spans_TS004(self, library_env, test_agent, test_library, test_server):
def test_measured_spans_TS004(self, library_env, test_agent, test_library):
"""
When spans are marked as measured
Each has stats computed for it
Expand Down Expand Up @@ -227,7 +227,7 @@ def test_measured_spans_TS004(self, library_env, test_agent, test_library, test_
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_top_level_TS005(self, library_env, test_agent, test_library, test_server):
def test_top_level_TS005(self, library_env, test_agent, test_library):
"""
When top level (service entry) spans are created
Each top level span has trace stats computed for it.
Expand Down Expand Up @@ -277,7 +277,7 @@ def test_top_level_TS005(self, library_env, test_agent, test_library, test_serve
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_successes_errors_recorded_separately_TS006(self, library_env, test_agent, test_library, test_server):
def test_successes_errors_recorded_separately_TS006(self, library_env, test_agent, test_library):
"""
When spans are marked as errors
The errors count is incremented appropriately and the stats are aggregated into the ErrorSummary
Expand Down Expand Up @@ -333,7 +333,7 @@ def test_successes_errors_recorded_separately_TS006(self, library_env, test_agen
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats(sample_rate=0.0)
def test_sample_rate_0_TS007(self, library_env, test_agent, test_library, test_server):
def test_sample_rate_0_TS007(self, library_env, test_agent, test_library):
"""
When the sample rate is 0 and trace stats is enabled
non-P0 traces should be dropped
Expand Down Expand Up @@ -399,7 +399,7 @@ def test_relative_error_TS008(self, library_env, test_agent, test_library):
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_metrics_computed_after_span_finsh_TS009(self, library_env, test_agent, test_library, test_server):
def test_metrics_computed_after_span_finsh_TS009(self, library_env, test_agent, test_library):
"""
When trace stats are computed for traces
Metrics must be computed after spans are finished, otherwise components of the aggregation key may change after
Expand Down
2 changes: 1 addition & 1 deletion tests/parametric/test_otel_span_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,7 @@ def test_otel_add_event_meta_serialization(self, test_agent, test_library):

event2 = events[1]
assert event2.get("name") == "second_event"
assert event2.get("time_unix_nano") // 100000 == event2_timestamp_ns // 100000 # reduce the precision tested
assert abs(event2.get("time_unix_nano") - event2_timestamp_ns) < 100000 # reduce the precision tested
assert event2["attributes"].get("string_val") == "value"

event3 = events[2]
Expand Down
Loading

0 comments on commit ad46fba

Please sign in to comment.