
Commit 74a9b76

Improvements related to deployment (#69)
* Add exponential backoff and retries to query execution
* Fix handling of nested OTel spans
* Bump up the max runtime of activities
* Improve documentation
1 parent f3c67d3 commit 74a9b76

7 files changed: 155 additions & 162 deletions


oonipipeline/Readme.md

Lines changed: 6 additions & 2 deletions
@@ -21,6 +21,10 @@ In order to run the pipeline you should setup the following dependencies:
 
 ### Quick start
 
+```
+git clone https://github.com/ooni/data
+```
+
 Start temporal dev server:
 
 ```
@@ -38,7 +42,7 @@ clickhouse server
 You can then start the desired workflow, for example to create signal observations for the US:
 
 ```
-hatch run oonipipeline mkobs --probe-cc US --test-name signal --start-day 2024-01-01 --end-day 2024-01-02
+hatch run oonipipeline mkobs --probe-cc US --test-name signal --start-day 2024-01-01 --end-day 2024-01-02 --create-tables
 ```
 
 Monitor the workflow executing by accessing: http://localhost:8233/
@@ -92,7 +96,7 @@ hatch run oonipipeline startworkers
 Then you can trigger the workflow by passing the `--no-start-workers` flag:
 
 ```
-hatch run oonipipeline mkobs --probe-cc US --start-day 2024-01-01 --end-day 2024-01-20 --no-start-workers
+hatch run oonipipeline mkobs --probe-cc US --start-day 2024-01-01 --end-day 2024-01-20 --no-start-workers --create-tables
 ```
 
 #### Superset
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-VERSION = "5.0.0a0"
+VERSION = "5.0.0a1"

oonipipeline/src/oonipipeline/db/connections.py

Lines changed: 31 additions & 24 deletions
@@ -1,5 +1,6 @@
 import csv
 import pickle
+import random
 import time
 
 from collections import defaultdict, namedtuple
@@ -32,7 +33,9 @@ def __init__(
         conn_url,
         row_buffer_size=0,
         max_block_size=1_000_000,
-        dump_failing_rows: Optional[str] = None,
+        max_retries=3,
+        backoff_factor=1,
+        max_backoff=32,
     ):
         from clickhouse_driver import Client
 
@@ -44,7 +47,10 @@ def __init__(
 
         self._column_names = {}
         self._row_buffer = defaultdict(list)
-        self.dump_failing_rows = dump_failing_rows
+
+        self._max_retries = max_retries
+        self._max_backoff = max_backoff
+        self._backoff_factor = backoff_factor
 
     def __enter__(self):
         return self
@@ -57,9 +63,30 @@ def delete_sync(self, table_name: str, where: str):
         self.execute("SET mutations_sync = 1;")
         return self.execute(f"DELETE FROM {table_name} WHERE {where};")
 
-    def execute(self, *args, **kwargs):
+    def _execute(self, *args, **kwargs):
         return self.client.execute(*args, **kwargs)
 
+    def execute(self, query_str, *args, **kwargs):
+        exception_list = []
+        # Exponentially backoff the retries
+        for attempt in range(self._max_retries):
+            try:
+                return self._execute(query_str, *args, **kwargs)
+            except Exception as e:
+                exception_list.append(e)
+                sleep_time = min(self._max_backoff, self._backoff_factor * (2**attempt))
+                log.error(
+                    f"failed to execute {query_str} args[{len(args)}] kwargs[{len(kwargs)}] (attempt {attempt})"
+                )
+                log.error(e)
+                log.error("exception history")
+                for exc in exception_list[:-1]:
+                    log.error(exc)
+                sleep_time += random.uniform(0, sleep_time * 0.1)
+                time.sleep(sleep_time)
+        # Raise the last exception
+        raise exception_list[-1]
+
     def execute_iter(self, *args, **kwargs):
         return self.client.execute_iter(
             *args, **kwargs, settings={"max_block_size": self.max_block_size}
@@ -82,27 +109,7 @@ def write_rows(self, table_name, rows, column_names):
     def flush_rows(self, table_name, rows):
         fields_str = ", ".join(self._column_names[table_name])
         query_str = f"INSERT INTO {table_name} ({fields_str}) VALUES"
-        try:
-            self.execute(query_str, rows)
-        except Exception as exc:
-            log.error(
-                f"Failed to write {len(rows)} rows. Trying to savage what is savageable. ({exc})"
-            )
-            for idx, row in enumerate(rows):
-                try:
-                    self.execute(
-                        query_str,
-                        [row],
-                        types_check=True,
-                        query_id=f"oonidata-savage-{idx}-{time.time()}",
-                    )
-                    time.sleep(0.1)
-                except Exception as exc:
-                    log.error(f"Failed to write {row} ({exc}) {query_str}")
-
-                    if self.dump_failing_rows:
-                        with open(self.dump_failing_rows, "ab") as out_file:
-                            pickle.dump({"query_str": query_str, "row": row}, out_file)
+        self.execute(query_str, rows)
 
     def flush_all_rows(self):
         for table_name, rows in self._row_buffer.items():
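
For reference, the retry loop added to `execute()` above is the standard capped exponential backoff with jitter pattern. Below is a minimal standalone sketch of the same schedule in plain Python (not the `ClickhouseConnection` class itself); with the new defaults of `max_retries=3`, `backoff_factor=1` and `max_backoff=32` the base delays work out to 1s, 2s and 4s, each padded with up to 10% random jitter.

```python
import random
import time


def retry_with_backoff(fn, max_retries=3, backoff_factor=1, max_backoff=32):
    # Call fn(); on failure sleep backoff_factor * 2**attempt seconds (capped
    # at max_backoff, plus up to 10% jitter) and retry, re-raising the last
    # exception once all attempts are exhausted.
    last_exc = None
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as exc:
            last_exc = exc
            sleep_time = min(max_backoff, backoff_factor * (2**attempt))
            sleep_time += random.uniform(0, sleep_time * 0.1)
            time.sleep(sleep_time)
    raise last_exc


if __name__ == "__main__":
    # Succeeds on the first attempt, so no sleeping happens here.
    print(retry_with_backoff(lambda: "ok"))
```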

oonipipeline/src/oonipipeline/temporal/activities/analysis.py

Lines changed: 109 additions & 122 deletions
@@ -107,132 +107,119 @@ def make_analysis_in_a_day(params: MakeAnalysisParams) -> dict:
 
     tracer = opentelemetry.trace.get_tracer(__name__)
 
-    with opentelemetry.trace.get_current_span():
-        fingerprintdb = FingerprintDB(datadir=data_dir, download=False)
-        body_db = BodyDB(db=ClickhouseConnection(clickhouse))
-        db_writer = ClickhouseConnection(clickhouse, row_buffer_size=10_000)
-        db_lookup = ClickhouseConnection(clickhouse)
-
-        column_names_wa = [f.name for f in dataclasses.fields(WebAnalysis)]
-        column_names_er = [
-            f.name for f in dataclasses.fields(MeasurementExperimentResult)
-        ]
-
-        # TODO(art): this previous range search and deletion makes the idempotence
-        # of the activity not 100% accurate.
-        # We should look into fixing it.
-        prev_range_list = [
-            get_prev_range(
-                db=db_lookup,
-                table_name=WebAnalysis.__table_name__,
-                timestamp=datetime.combine(day, datetime.min.time()),
-                test_name=[],
-                probe_cc=probe_cc,
-                timestamp_column="measurement_start_time",
-            ),
-            get_prev_range(
-                db=db_lookup,
-                table_name=MeasurementExperimentResult.__table_name__,
-                timestamp=datetime.combine(day, datetime.min.time()),
-                test_name=[],
-                probe_cc=probe_cc,
-                timestamp_column="timeofday",
-                probe_cc_column="location_network_cc",
-            ),
-        ]
-
-        log.info(f"loading ground truth DB for {day}")
-        with tracer.start_as_current_span(
-            "MakeObservations:load_ground_truths"
-        ) as span:
-            ground_truth_db_path = (
-                data_dir / "ground_truths" / f"web-{day.strftime('%Y-%m-%d')}.sqlite3"
-            )
-            web_ground_truth_db = WebGroundTruthDB()
-            web_ground_truth_db.build_from_existing(
-                str(ground_truth_db_path.absolute())
-            )
-            log.info(f"loaded ground truth DB for {day}")
-            span.add_event(f"loaded ground truth DB for {day}")
-            span.set_attribute("day", day.strftime("%Y-%m-%d"))
-            span.set_attribute(
-                "ground_truth_row_count", web_ground_truth_db.count_rows()
-            )
+    fingerprintdb = FingerprintDB(datadir=data_dir, download=False)
+    body_db = BodyDB(db=ClickhouseConnection(clickhouse))
+    db_writer = ClickhouseConnection(clickhouse, row_buffer_size=10_000)
+    db_lookup = ClickhouseConnection(clickhouse)
+
+    column_names_wa = [f.name for f in dataclasses.fields(WebAnalysis)]
+    column_names_er = [f.name for f in dataclasses.fields(MeasurementExperimentResult)]
+
+    # TODO(art): this previous range search and deletion makes the idempotence
+    # of the activity not 100% accurate.
+    # We should look into fixing it.
+    prev_range_list = [
+        get_prev_range(
+            db=db_lookup,
+            table_name=WebAnalysis.__table_name__,
+            timestamp=datetime.combine(day, datetime.min.time()),
+            test_name=[],
+            probe_cc=probe_cc,
+            timestamp_column="measurement_start_time",
+        ),
+        get_prev_range(
+            db=db_lookup,
+            table_name=MeasurementExperimentResult.__table_name__,
+            timestamp=datetime.combine(day, datetime.min.time()),
+            test_name=[],
+            probe_cc=probe_cc,
+            timestamp_column="timeofday",
+            probe_cc_column="location_network_cc",
+        ),
+    ]
+
+    log.info(f"loading ground truth DB for {day}")
+    with tracer.start_span("MakeObservations:load_ground_truths") as span:
+        ground_truth_db_path = (
+            data_dir / "ground_truths" / f"web-{day.strftime('%Y-%m-%d')}.sqlite3"
+        )
+        web_ground_truth_db = WebGroundTruthDB()
+        web_ground_truth_db.build_from_existing(str(ground_truth_db_path.absolute()))
+        log.info(f"loaded ground truth DB for {day}")
+        span.add_event(f"loaded ground truth DB for {day}")
+        span.set_attribute("day", day.strftime("%Y-%m-%d"))
+        span.set_attribute("ground_truth_row_count", web_ground_truth_db.count_rows())
+
+    failures = 0
+    no_exp_results = 0
+    observation_count = 0
+    with tracer.start_span("MakeObservations:iter_web_observations") as span:
+        for web_obs in iter_web_observations(
+            db_lookup,
+            measurement_day=day,
+            probe_cc=probe_cc,
+            test_name="web_connectivity",
+        ):
+            try:
+                relevant_gts = web_ground_truth_db.lookup_by_web_obs(web_obs=web_obs)
+            except:
+                log.error(
+                    f"failed to lookup relevant_gts for {web_obs[0].measurement_uid}",
+                    exc_info=True,
+                )
+                failures += 1
+                continue
 
-        failures = 0
-        no_exp_results = 0
-        observation_count = 0
-        with tracer.start_as_current_span(
-            "MakeObservations:iter_web_observations"
-        ) as span:
-            for web_obs in iter_web_observations(
-                db_lookup,
-                measurement_day=day,
-                probe_cc=probe_cc,
-                test_name="web_connectivity",
-            ):
-                try:
-                    relevant_gts = web_ground_truth_db.lookup_by_web_obs(
-                        web_obs=web_obs
-                    )
-                except:
-                    log.error(
-                        f"failed to lookup relevant_gts for {web_obs[0].measurement_uid}",
-                        exc_info=True,
+            try:
+                website_analysis = list(
+                    make_web_analysis(
+                        web_observations=web_obs,
+                        body_db=body_db,
+                        web_ground_truths=relevant_gts,
+                        fingerprintdb=fingerprintdb,
                     )
-                    failures += 1
+                )
+                if len(website_analysis) == 0:
+                    log.info(f"no website analysis for {probe_cc}, {test_name}")
+                    no_exp_results += 1
                     continue
 
-                try:
-                    website_analysis = list(
-                        make_web_analysis(
-                            web_observations=web_obs,
-                            body_db=body_db,
-                            web_ground_truths=relevant_gts,
-                            fingerprintdb=fingerprintdb,
-                        )
-                    )
-                    if len(website_analysis) == 0:
-                        log.info(f"no website analysis for {probe_cc}, {test_name}")
-                        no_exp_results += 1
-                        continue
-
-                    observation_count += 1
-                    table_name, rows = make_db_rows(
-                        dc_list=website_analysis, column_names=column_names_wa
-                    )
-
-                    db_writer.write_rows(
-                        table_name=table_name,
-                        rows=rows,
-                        column_names=column_names_wa,
-                    )
-
-                    website_er = list(make_website_experiment_results(website_analysis))
-                    table_name, rows = make_db_rows(
-                        dc_list=website_er,
-                        column_names=column_names_er,
-                        custom_remap={"loni_list": orjson.dumps},
-                    )
-
-                    db_writer.write_rows(
-                        table_name=table_name,
-                        rows=rows,
-                        column_names=column_names_er,
-                    )
-
-                except:
-                    web_obs_ids = ",".join(map(lambda wo: wo.observation_id, web_obs))
-                    log.error(
-                        f"failed to generate analysis for {web_obs_ids}", exc_info=True
-                    )
-                    failures += 1
-
-            span.set_attribute("total_failure_count", failures)
-            span.set_attribute("total_observation_count", observation_count)
-            span.set_attribute("no_experiment_results_count", no_exp_results)
-            span.set_attribute("day", day.strftime("%Y-%m-%d"))
-            span.set_attribute("probe_cc", probe_cc)
+                observation_count += 1
+                table_name, rows = make_db_rows(
+                    dc_list=website_analysis, column_names=column_names_wa
+                )
+
+                db_writer.write_rows(
+                    table_name=table_name,
+                    rows=rows,
+                    column_names=column_names_wa,
+                )
+
+                website_er = list(make_website_experiment_results(website_analysis))
+                table_name, rows = make_db_rows(
+                    dc_list=website_er,
+                    column_names=column_names_er,
+                    custom_remap={"loni_list": orjson.dumps},
+                )
+
+                db_writer.write_rows(
+                    table_name=table_name,
+                    rows=rows,
+                    column_names=column_names_er,
+                )
+
+            except:
+                web_obs_ids = ",".join(map(lambda wo: wo.observation_id, web_obs))
+                log.error(
+                    f"failed to generate analysis for {web_obs_ids}", exc_info=True
+                )
+                failures += 1
+
+        span.set_attribute("total_failure_count", failures)
+        span.set_attribute("total_observation_count", observation_count)
+        span.set_attribute("no_experiment_results_count", no_exp_results)
+        span.set_attribute("day", day.strftime("%Y-%m-%d"))
+        span.set_attribute("probe_cc", probe_cc)
 
     for prev_range in prev_range_list:
         maybe_delete_prev_range(db=db_lookup, prev_range=prev_range)
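
The recurring change here (and in observations.py below) is replacing `tracer.start_as_current_span(...)` with `tracer.start_span(...)`, which is what fixes the handling of nested OTel spans: `start_as_current_span` activates the span in the current context, so every span started inside the block becomes its child, while `start_span` records and ends a span without making it current. A minimal sketch with the OpenTelemetry Python API/SDK (console exporter purely for illustration; this is not the pipeline's actual tracing setup):

```python
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)
tracer = trace.get_tracer(__name__)

# start_as_current_span() activates the span in the current context, so any
# span started inside the block automatically becomes its child.
with tracer.start_as_current_span("parent"):
    with tracer.start_span("child") as span:  # recorded as a child of "parent"
        span.set_attribute("example", True)

# start_span() creates (and, via the context manager, ends) a span without
# activating it, so later spans are not implicitly nested underneath it.
with tracer.start_span("standalone") as span:
    span.add_event("work done")
```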

oonipipeline/src/oonipipeline/temporal/activities/observations.py

Lines changed: 2 additions & 6 deletions
@@ -75,17 +75,13 @@ def make_observations_for_file_entry_batch(
 
     total_failure_count = 0
     current_span = trace.get_current_span()
-    with current_span, ClickhouseConnection(
-        clickhouse, row_buffer_size=row_buffer_size
-    ) as db:
+    with ClickhouseConnection(clickhouse, row_buffer_size=row_buffer_size) as db:
         ccs = ccs_set(probe_cc)
         idx = 0
         for bucket_name, s3path, ext, fe_size in file_entry_batch:
             failure_count = 0
             # Nest the traced span within the current span
-            with tracer.start_as_current_span(
-                "MakeObservations:stream_file_entry"
-            ) as span:
+            with tracer.start_span("MakeObservations:stream_file_entry") as span:
                 log.debug(f"processing file s3://{bucket_name}/{s3path}")
                 t = PerfTimer()
                 try:

oonipipeline/src/oonipipeline/temporal/workflows.py

Lines changed: 2 additions & 1 deletion
@@ -59,7 +59,8 @@
 TASK_QUEUE_NAME = "oonipipeline-task-queue"
 OBSERVATION_WORKFLOW_ID = "oonipipeline-observations"
 
-MAKE_OBSERVATIONS_START_TO_CLOSE_TIMEOUT = timedelta(hours=24)
+# TODO(art): come up with a nicer way to nest workflows so we don't need such a high global timeout
+MAKE_OBSERVATIONS_START_TO_CLOSE_TIMEOUT = timedelta(hours=48)
 MAKE_GROUND_TRUTHS_START_TO_CLOSE_TIMEOUT = timedelta(hours=1)
 MAKE_ANALYSIS_START_TO_CLOSE_TIMEOUT = timedelta(hours=10)
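
`MAKE_OBSERVATIONS_START_TO_CLOSE_TIMEOUT` bounds a single attempt of the observations activity and is supplied when the workflow schedules it. A hedged sketch of how such a constant is typically wired up with the temporalio Python SDK; the workflow class and activity name below are illustrative, not the pipeline's actual definitions:

```python
from datetime import timedelta

from temporalio import workflow

# Same constant as in the diff above; 48h bounds one attempt of the activity.
MAKE_OBSERVATIONS_START_TO_CLOSE_TIMEOUT = timedelta(hours=48)


@workflow.defn
class ExampleObservationsWorkflow:
    @workflow.run
    async def run(self, params: str) -> None:
        # The timeout is passed per activity invocation; the attempt is timed
        # out if it runs longer than start_to_close_timeout.
        await workflow.execute_activity(
            "make_observations_for_file_entry_batch",  # activity name, illustrative
            params,
            start_to_close_timeout=MAKE_OBSERVATIONS_START_TO_CLOSE_TIMEOUT,
        )
```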
