Seldon deployment for build log clustering#
In this notebook, we deploy a Seldon service for clustering build logs. First, we take the experiments from the build log clustering notebook and train a Sklearn pipeline combining all the components. Then, we save the model to S3 storage and deploy a Seldon service that uses the saved model. Finally, we test the service for inference on an example request.
import os
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import boto3
import json
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
True
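The call above reads credentials from a local `.env` file. As a rough sketch, the variables this notebook expects look like the following (names taken from the `os.getenv` calls further down; the values shown are placeholders):
# Example .env contents (placeholder values):
# S3_ENDPOINT=https://s3.example.com
# S3_ACCESS_KEY=<access-key>
# S3_SECRET_KEY=<secret-key>
# S3_BUCKET=<bucket-name>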
Load Dataset#
# Note: periodic jobs only (see FIXME in class Builds)
job_name = "periodic-ci-openshift-release-master-ci-4.8-e2e-gcp"
logs_path = "../../../../data/raw/gcs/build-logs/" # local cache of build log files
metadata_path = "../../../../data/raw/gcs/build-metadata/" # path to saved metadata
metadata_file_name = os.path.join(metadata_path, f"{job_name}_build-logs.csv")
def log_path_for(build_id):
    return os.path.join(logs_path, f"{build_id}.txt")
def prow_url_for(build_id):
    project = "origin-ci-test"
    # FIXME: this prefix is only for periodic jobs
    job_prefix = f"logs/{job_name}/"
    return f"https://prow.ci.openshift.org/view/gcs/{project}/{job_prefix}{build_id}"
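As a quick illustration of these helpers (the build ID is one that appears in the metadata table below; the printed values follow directly from the code above):
# Illustrative usage only; build ID taken from the metadata shown below
example_build_id = 1429152444788510720
print(log_path_for(example_build_id))
# ../../../../data/raw/gcs/build-logs/1429152444788510720.txt
print(prow_url_for(example_build_id))
# https://prow.ci.openshift.org/view/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.8-e2e-gcp/1429152444788510720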
def clean_df(df):
    """Polishes the metadata DataFrame"""
    build_errors = df[df["result"] == "error"].index
    df.drop(build_errors, inplace=True)  # Remove builds that errored (Prow error)
    df["duration"] = df["end"] - df["start"]  # From timestamps to job duration
    df["success"] = df["result"] == "SUCCESS"  # A boolean version of the result
    return df
print("Reading metadata from", metadata_file_name)
df = pd.read_csv(metadata_file_name, index_col=0)
df = clean_df(df)
df
Reading metadata from ../../../../data/raw/gcs/build-metadata/periodic-ci-openshift-release-master-ci-4.8-e2e-gcp_build-logs.csv
| build_id | result | size | start (Unix s) | end (Unix s) | duration (s) | success |
|---|---|---|---|---|---|---|
| 1429152444788510720 | SUCCESS | 4135 | 1629571472 | 1629576480 | 5008 | True |
| 1455624937803878400 | SUCCESS | 4579 | 1635883006 | 1635887416 | 4410 | True |
| 1445593776872493056 | SUCCESS | 4397 | 1633491392 | 1633496097 | 4705 | True |
| 1417019048973045760 | SUCCESS | 4134 | 1626678644 | 1626683464 | 4820 | True |
| 1427589558375026688 | SUCCESS | 4133 | 1629198851 | 1629203489 | 4638 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 1464437854917627904 | SUCCESS | 4579 | 1637984169 | 1637988828 | 4659 | True |
| 1420899046205165568 | SUCCESS | 4131 | 1627603731 | 1627608772 | 5041 | True |
| 1410375749352820736 | FAILURE | 8880668 | 1625094759 | 1625100569 | 5810 | False |
| 1422945097544110080 | SUCCESS | 4133 | 1628091552 | 1628096732 | 5180 | True |
| 1462490101803126784 | SUCCESS | 4581 | 1637519789 | 1637524977 | 5188 | True |

1080 rows × 6 columns
# Get a list of paths to the local copy of each build log
build_logs = []
for build_id in df.index:
    with open(log_path_for(build_id), "r") as f:
        build_logs.append(f.read())
Train Sklearn Pipeline#
token_pattern = r"\b[a-z][a-z0-9_/\.-]+\b"
vectorizer = TfidfVectorizer(
    min_df=0.03,
    token_pattern=token_pattern,
)
k = 3
kmeans = KMeans(n_clusters=k, random_state=123)
pipeline = Pipeline([("tfidf", vectorizer), ("kmeans", kmeans)])
pipeline.fit(build_logs)
Pipeline(steps=[('tfidf',
TfidfVectorizer(min_df=0.03,
token_pattern='\\b[a-z][a-z0-9_/\\.-]+\\b')),
('kmeans', KMeans(n_clusters=3, random_state=123))])
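To get a feel for what the three clusters capture, we can inspect the highest-weighted TF-IDF terms in each cluster centroid. This is a small sketch using standard scikit-learn attributes (use `get_feature_names()` instead on scikit-learn versions older than 1.0); output omitted here:
# Show the top 10 TF-IDF terms closest to each cluster centroid
feature_names = pipeline.named_steps["tfidf"].get_feature_names_out()
centroids = pipeline.named_steps["kmeans"].cluster_centers_
for cluster_id, centroid in enumerate(centroids):
    top_terms = [feature_names[i] for i in centroid.argsort()[::-1][:10]]
    print(f"Cluster {cluster_id}: {top_terms}")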
Save Pipeline#
joblib.dump(pipeline, "model.joblib")
['model.joblib']
# Test set
test_set = [i for i in build_logs if len(i) < 5000][:25]
# Sanity check to see if the saved model works locally
pipeline_loaded = joblib.load("model.joblib")
pipeline_loaded
pipeline_loaded.predict(test_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0], dtype=int32)
# Set credentials for your s3 storage
s3_endpoint_url = os.getenv("S3_ENDPOINT")
aws_access_key_id = os.getenv("S3_ACCESS_KEY")
aws_secret_access_key = os.getenv("S3_SECRET_KEY")
s3_bucket = os.getenv("S3_BUCKET")
s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
bucket = s3_resource.Bucket(name=s3_bucket)
# Upload your model
bucket.upload_file(
    "model.joblib", "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib"
)
# Check if your model exists on s3
objects = [
    obj.key for obj in bucket.objects.filter(Prefix="") if "model.joblib" in obj.key
]
objects
['ai4ci/build-log-clustering/tfidf-kmeans/model.joblib',
'ai4ci/failure-classifier/model.joblib',
'ai4ci/github-pr-ttm/model/model.joblib',
'github/ttm-model-raw-data/pipeline/model.joblib',
'github/ttm-model/model.joblib',
'github/ttm-model/pipeline/model.joblib']
Test Seldon deployment service#
We use the deployment config to deploy a Seldon service that serves the model saved on S3.
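The deployment config itself lives outside this notebook. For orientation, here is a minimal sketch of what such a SeldonDeployment resource could look like, expressed as a Python dict (field names follow the Seldon Core v1 CRD; the bucket placeholder and secret name are assumptions, not the project's actual manifest):
# Sketch of a SeldonDeployment that points the prepackaged sklearn server
# at the model uploaded above (placeholders for the bucket and credentials secret)
seldon_deployment = {
    "apiVersion": "machinelearning.seldon.io/v1",
    "kind": "SeldonDeployment",
    "metadata": {"name": "build-log-clustering"},
    "spec": {
        "predictors": [
            {
                "name": "default",
                "replicas": 1,
                "graph": {
                    "name": "classifier",
                    "implementation": "SKLEARN_SERVER",
                    "modelUri": "s3://<s3-bucket>/ai4ci/build-log-clustering/tfidf-kmeans",
                    "envSecretRefName": "<s3-credentials-secret>",
                },
            }
        ]
    },
}
Applying a manifest like this to the cluster is what exposes the prediction endpoint queried below.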
# Service url
base_url = "http://build-log-clustering-ds-ml-workflows-ws.apps.smaug.na.operate-first.cloud/predict"
# package the raw build logs in the payload format expected by Seldon
data = {"data": {"ndarray": test_set}}
# create the query payload
json_data = json.dumps(data)
headers = {"Content-Type": "application/json"}
# query our inference service
response = requests.post(base_url, data=json_data, headers=headers)
response
<Response [200]>
response.json()
{'data': {'names': [],
'ndarray': [0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
2,
0,
0,
0,
0,
0,
0,
0,
0]},
'meta': {'requestPath': {'classifier': 'registry.connect.redhat.com/seldonio/sklearnserver@sha256:88d126455b150291cbb3772f67b4f35a88bb54b15ff7c879022f77fb051615ad'}}}
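Since the service is backed by the same `model.joblib` we tested locally, the remote cluster assignments should match the local ones. A small consistency check (sketch, output omitted):
# Compare the service's predictions against the locally loaded pipeline
remote_predictions = response.json()["data"]["ndarray"]
local_predictions = pipeline_loaded.predict(test_set)
print(all(int(r) == int(l) for r, l in zip(remote_predictions, local_predictions)))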
Conclusion#
In this notebook, we saw how to create and save an unsupervised model for clustering build logs. We successfully deployed and tested the model using S3 for storage and a Seldon service on OpenShift.