Skip to content

Commit

Permalink
Fix parametric instability at container start (#3359)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbeauchesne authored Nov 4, 2024
1 parent 41807c2 commit ad46fba
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 133 deletions.
6 changes: 0 additions & 6 deletions docs/scenarios/parametric.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,6 @@ library. Deleting the image will force a rebuild which will resolve the issue.
docker image rm <library>-test-library
```

### Port conflict on 50052

If there is a port conflict with an existing process on the local machine then the default port `50052` can be
overridden using `APM_LIBRARY_SERVER_PORT`.


### Disable build kit

If logs like
Expand Down
101 changes: 29 additions & 72 deletions tests/parametric/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Dict, Generator, List, TextIO, TypedDict
import urllib.parse

from docker.models.containers import Container
import requests
import pytest

Expand Down Expand Up @@ -387,44 +386,6 @@ def wait_for_tracer_flare(self, case_id: str = None, clear: bool = False, wait_l
raise AssertionError("No tracer-flare received")


@contextlib.contextmanager
def docker_run(
    image: str,
    name: str,
    cmd: List[str],
    env: Dict[str, str],
    volumes: Dict[str, str],
    host_port: int,
    container_port: int,
    log_file: TextIO,
    network_name: str,
) -> Generator[Container, None, None]:
    """Start a docker container via the parametric scenario helper and yield it.

    On exit the container is stopped (1s grace), its logs are flushed to
    ``log_file``, and the container is force-removed.
    """
    logger.info(f"Starting {name}")

    ctr = scenarios.parametric.docker_run(
        image,
        name=name,
        command=cmd,
        env=env,
        volumes=volumes,
        network=network_name,
        host_port=host_port,
        container_port=container_port,
    )

    try:
        yield ctr
    finally:
        # Tear down regardless of how the caller's block exited.
        logger.info(f"Stopping {name}")
        ctr.stop(timeout=1)
        log_file.write(ctr.logs().decode("utf-8"))
        log_file.flush()
        ctr.remove(force=True)


@pytest.fixture(scope="session")
def docker() -> str:
"""Fixture to ensure docker is ready to use on the system."""
Expand Down Expand Up @@ -515,28 +476,27 @@ def test_agent(
# (trace_content_length) go client doesn't submit content length header
env["ENABLED_CHECKS"] = "trace_count_header"

host_port = scenarios.parametric.get_host_port(worker_id, 50000)
host_port = scenarios.parametric.get_host_port(worker_id, 4600)

with docker_run(
with scenarios.parametric.docker_run(
image=scenarios.parametric.TEST_AGENT_IMAGE,
name=test_agent_container_name,
cmd=[],
command=[],
env=env,
volumes={f"{os.getcwd()}/snapshots": "/snapshots"},
host_port=host_port,
container_port=test_agent_port,
log_file=test_agent_log_file,
network_name=docker_network,
network=docker_network,
):
logger.debug(f"Test agent {test_agent_container_name} started on host port {host_port}")

client = _TestAgentAPI(base_url=f"http://localhost:{host_port}", pytest_request=request)
time.sleep(0.2) # intial wait time, the trace agent takes 200ms to start
for _ in range(100):
try:
resp = client.info()
except:
logger.debug("Wait for 0.1s for the test agent to be ready")
except Exception as e:
logger.debug(f"Wait for 0.1s for the test agent to be ready {e}")
time.sleep(0.1)
else:
if resp["version"] != "test":
Expand All @@ -547,8 +507,7 @@ def test_agent(
logger.info("Test agent is ready")
break
else:
with open(test_agent_log_file.name) as f:
logger.error(f"Could not connect to test agent: {f.read()}")
logger.error("Could not connect to test agent")
pytest.fail(
f"Could not connect to test agent, check the log file {test_agent_log_file.name}.", pytrace=False
)
Expand All @@ -568,14 +527,15 @@ def test_agent(


@pytest.fixture
def test_server(
def test_library(
worker_id: str,
docker_network: str,
test_agent_port: str,
test_agent_container_name: str,
apm_test_server: APMLibraryTestServer,
test_server_log_file: TextIO,
):
) -> Generator[APMLibrary, None, None]:

env = {
"DD_TRACE_DEBUG": "true",
"DD_TRACE_AGENT_URL": f"http://{test_agent_container_name}:{test_agent_port}",
Expand All @@ -590,38 +550,35 @@ def test_server(
test_server_env[k] = v
env.update(test_server_env)

apm_test_server.host_port = scenarios.parametric.get_host_port(worker_id, 51000)
apm_test_server.host_port = scenarios.parametric.get_host_port(worker_id, 4500)

with docker_run(
with scenarios.parametric.docker_run(
image=apm_test_server.container_tag,
name=apm_test_server.container_name,
cmd=apm_test_server.container_cmd,
command=apm_test_server.container_cmd,
env=env,
host_port=apm_test_server.host_port,
container_port=apm_test_server.container_port,
volumes=apm_test_server.volumes,
log_file=test_server_log_file,
network_name=docker_network,
network=docker_network,
) as container:
logger.debug(f"Test server {apm_test_server.container_name} started on host port {apm_test_server.host_port}")
apm_test_server.container = container
yield apm_test_server

test_server_timeout = 60

@pytest.fixture
def test_library(test_server: APMLibraryTestServer) -> Generator[APMLibrary, None, None]:
test_server_timeout = 60

if test_server.host_port is None:
raise RuntimeError("Internal error, no port has been assigned", 1)
if apm_test_server.host_port is None:
raise RuntimeError("Internal error, no port has been assigned", 1)

if test_server.protocol == "grpc":
client = APMLibraryClientGRPC(f"localhost:{test_server.host_port}", test_server_timeout, test_server.container)
elif test_server.protocol == "http":
client = APMLibraryClientHTTP(
f"http://localhost:{test_server.host_port}", test_server_timeout, test_server.container
)
else:
raise ValueError(f"Interface {test_server.protocol} not supported")
tracer = APMLibrary(client, test_server.lang)
yield tracer
if apm_test_server.protocol == "grpc":
client = APMLibraryClientGRPC(
f"localhost:{apm_test_server.host_port}", test_server_timeout, apm_test_server.container
)
elif apm_test_server.protocol == "http":
client = APMLibraryClientHTTP(
f"http://localhost:{apm_test_server.host_port}", test_server_timeout, apm_test_server.container
)
else:
raise ValueError(f"Interface {apm_test_server.protocol} not supported")
tracer = APMLibrary(client, apm_test_server.lang)
yield tracer
1 change: 1 addition & 0 deletions tests/parametric/test_dynamic_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,7 @@ def test_trace_sampling_rules_with_tags(self, test_agent, test_library):
@bug(library="cpp", reason="unknown")
@bug(library="ruby", reason="To be investigated")
@bug(context.library > "[email protected]", reason="APMAPI-833")
@bug(library="python", reason="APMAPI-857")
@parametrize("library_env", [{**DEFAULT_ENVVARS}])
def test_remote_sampling_rules_retention(self, library_env, test_agent, test_library):
"""Only the last set of sampling rules should be applied"""
Expand Down
14 changes: 7 additions & 7 deletions tests/parametric/test_library_tracestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_metrics_msgpack_serialization_TS001(self, library_env, test_agent, test
@missing_feature(context.library == "nodejs", reason="nodejs has not implemented stats computation yet")
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_library, test_server):
def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_library):
"""
When spans are created with a unique set of dimensions
Each span has stats computed for it and is in its own bucket
Expand Down Expand Up @@ -160,7 +160,7 @@ def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_libr
) as span:
span.set_meta(key="http.status_code", val="400")

if test_server.lang == "golang":
if test_library.lang == "golang":
test_library.flush()

requests = test_agent.v06_stats_requests()
Expand All @@ -185,7 +185,7 @@ def test_distinct_aggregationkeys_TS003(self, library_env, test_agent, test_libr
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_measured_spans_TS004(self, library_env, test_agent, test_library, test_server):
def test_measured_spans_TS004(self, library_env, test_agent, test_library):
"""
When spans are marked as measured
Each has stats computed for it
Expand Down Expand Up @@ -227,7 +227,7 @@ def test_measured_spans_TS004(self, library_env, test_agent, test_library, test_
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_top_level_TS005(self, library_env, test_agent, test_library, test_server):
def test_top_level_TS005(self, library_env, test_agent, test_library):
"""
When top level (service entry) spans are created
Each top level span has trace stats computed for it.
Expand Down Expand Up @@ -277,7 +277,7 @@ def test_top_level_TS005(self, library_env, test_agent, test_library, test_serve
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_successes_errors_recorded_separately_TS006(self, library_env, test_agent, test_library, test_server):
def test_successes_errors_recorded_separately_TS006(self, library_env, test_agent, test_library):
"""
When spans are marked as errors
The errors count is incremented appropriately and the stats are aggregated into the ErrorSummary
Expand Down Expand Up @@ -333,7 +333,7 @@ def test_successes_errors_recorded_separately_TS006(self, library_env, test_agen
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats(sample_rate=0.0)
def test_sample_rate_0_TS007(self, library_env, test_agent, test_library, test_server):
def test_sample_rate_0_TS007(self, library_env, test_agent, test_library):
"""
When the sample rate is 0 and trace stats is enabled
non-P0 traces should be dropped
Expand Down Expand Up @@ -399,7 +399,7 @@ def test_relative_error_TS008(self, library_env, test_agent, test_library):
@missing_feature(context.library == "php", reason="php has not implemented stats computation yet")
@missing_feature(context.library == "ruby", reason="ruby has not implemented stats computation yet")
@enable_tracestats()
def test_metrics_computed_after_span_finsh_TS009(self, library_env, test_agent, test_library, test_server):
def test_metrics_computed_after_span_finsh_TS009(self, library_env, test_agent, test_library):
"""
When trace stats are computed for traces
Metrics must be computed after spans are finished, otherwise components of the aggregation key may change after
Expand Down
2 changes: 1 addition & 1 deletion tests/parametric/test_otel_span_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,7 @@ def test_otel_add_event_meta_serialization(self, test_agent, test_library):

event2 = events[1]
assert event2.get("name") == "second_event"
assert event2.get("time_unix_nano") // 100000 == event2_timestamp_ns // 100000 # reduce the precision tested
assert abs(event2.get("time_unix_nano") - event2_timestamp_ns) < 100000 # reduce the precision tested
assert event2["attributes"].get("string_val") == "value"

event3 = events[2]
Expand Down
Loading

0 comments on commit ad46fba

Please sign in to comment.