Contents

Validate the successful running of the Automated Pipeline

Contents

Validate the successful running of the Automated Pipeline#

A successful run of the pipeline should collect raw data and metrics in the project S3 bucket. This notebook is a simple utility notebook that checks the contents of the S3 bucket to validate that the automated pipeline ran successfully.

import os
import datetime
from dotenv import load_dotenv, find_dotenv
from ipynb.fs.defs.metric_template import CephCommunication

# Load variables from the .env file located by find_dotenv() so the
# os.getenv() lookups that follow can see the S3 settings.
load_dotenv(dotenv_path=find_dotenv())

True

# Read the S3 connection settings from the environment in one pass.
# Any variable that is not set comes back as None.
s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket = (
    os.environ.get(env_name)
    for env_name in ("S3_ENDPOINT", "S3_ACCESS_KEY", "S3_SECRET_KEY", "S3_BUCKET")
)

# Display the bucket name as a quick sanity check that the settings loaded.
s3_bucket

'opf-datacatalog'

# Build the storage client from the settings read above.
# NOTE(review): CephCommunication comes from the metric_template notebook;
# its argument order is taken as-is from the original call.
cc = CephCommunication(
    s3_endpoint_url,
    s3_access_key,
    s3_secret_key,
    s3_bucket,
)

# Handle to the project bucket; every listing below iterates over it.
bucket = cc.s3_resource.Bucket(s3_bucket)

# Check all .parquet files in S3, sorted by key.
# Match on the ".parquet" suffix rather than the substring "parquet", so keys
# that merely contain the word (for example a "parquet_tmp/" folder or a
# ".parquet.bak" file) are not reported as metric files.
objects = [i for i in bucket.objects.all() if i.key.endswith(".parquet")]
sorted(objects, key=lambda x: x.key)

[s3.ObjectSummary(bucket_name='opf-datacatalog', key='None/consec_fail_stats_df.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='None/pct_fixed_per_ts.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/avg_correlation/avg_correlation-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/avg_correlation/avg_correlation-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/avg_correlation/avg_correlation-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/avg_correlation/avg_correlation-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/avg_correlation/avg_correlation-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/blocked_timed_out/blocked_timed_out-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/blocked_timed_out/blocked_timed_out-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/blocked_timed_out/blocked_timed_out-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/blocked_timed_out/blocked_timed_out-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/blocked_timed_out/blocked_timed_out-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/build_pass_failure/build_pass_failure-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/build_pass_failure/build_pass_failure-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/build_pass_failure/build_pass_failure-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/build_pass_failure/build_pass_failure-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/correlation/correlation-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/correlation/correlation-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/correlation/correlation-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/correlation/correlation-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/correlation/correlation-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/flake.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/number_of_flakes/number_of_flakes-2021-5-4.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/pct_fixed_each_ts/pct_fixed_each_ts-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/pct_fixed_each_ts/pct_fixed_each_ts-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/pct_fixed_each_ts/pct_fixed_each_ts-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/pct_fixed_each_ts/pct_fixed_each_ts-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/pct_fixed_each_ts/pct_fixed_each_ts-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-28.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/persistent_failures/persistent_failures-2021-5-4.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/test_pass_failures/test_pass_failures-2021-5-3.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-4-14.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-4-16.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-4-19.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-4-27.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-4-29.parquet'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='metrics/time_to_test/time_to_test-2021-5-3.parquet')]

# List every object whose key contains "raw_data/", sorted by key.
objects = list(filter(lambda entry: "raw_data/" in entry.key, bucket.objects.all()))
sorted(objects, key=lambda entry: entry.key)

[s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_144.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_164.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_194.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_274.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_284.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_294.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_35.json'),
 s3.ObjectSummary(bucket_name='opf-datacatalog', key='raw_data/testgrid_45.json')]

Check for today’s data on S3#

# Date to validate. Defaults to today; assign another datetime here to
# check a different day.
timestamp = datetime.datetime.today()

# Key fragments searched for in the bucket. Neither pattern zero-pads:
#   metrics  -> "<year>-<month>-<day>.parquet"
#   raw data -> "testgrid_<day><month>.json" (day first, then month)
metric_name = "{ts.year}-{ts.month}-{ts.day}.parquet".format(ts=timestamp)
raw_data = "testgrid_{ts.day}{ts.month}.json".format(ts=timestamp)

# Metric files for the selected date; an empty list means none were
# found under that date's name.
metric_objects = list(
    filter(lambda entry: metric_name in entry.key, bucket.objects.all())
)
sorted(metric_objects, key=lambda entry: entry.key)

[]

# Raw data files for the selected date; an empty list means none were
# found under that date's name.
raw_data_objects = list(
    filter(lambda entry: raw_data in entry.key, bucket.objects.all())
)
sorted(raw_data_objects, key=lambda entry: entry.key)

[]