Seldon deployment for build log clustering#

In this notebook, we deploy a Seldon service for clustering build logs. First, we take the experiments from the build log clustering notebook and train a Sklearn pipeline with all the components. Then, we save the model to S3 storage and deploy a Seldon service that uses the saved model. Finally, we test the service by running inference on an example request.

import os
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import boto3
import json
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
True
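The S3 credentials used later in this notebook are read from a local .env file. As an optional check (a small sketch added here, not part of the original notebook; the variable names match the cells below), we can confirm they are all set before continuing:

# Hypothetical sanity check: confirm the S3 settings used later are present
required_vars = ["S3_ENDPOINT", "S3_ACCESS_KEY", "S3_SECRET_KEY", "S3_BUCKET"]
missing = [v for v in required_vars if not os.getenv(v)]
if missing:
    raise RuntimeError(f"Missing environment variables: {missing}")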

Load Dataset#

# Note: periodic jobs only (see FIXME in class Builds)
job_name = "periodic-ci-openshift-release-master-ci-4.8-e2e-gcp"

logs_path = "../../../../data/raw/gcs/build-logs/"  # local cache of build log files
metadata_path = "../../../../data/raw/gcs/build-metadata/"  # path to saved metadata
metadata_file_name = os.path.join(metadata_path, f"{job_name}_build-logs.csv")


def log_path_for(build_id):
    return os.path.join(logs_path, f"{build_id}.txt")


def prow_url_for(build_id):
    project = "origin-ci-test"
    # FIXME: this prefix is only for periodic jobs
    job_prefix = f"logs/{job_name}/"
    return f"https://prow.ci.openshift.org/view/gcs/{project}/{job_prefix}{build_id}"


def clean_df(df):
    """Polishes the metadata DataFrame"""
    build_errors = df[df["result"] == "error"].index
    df.drop(build_errors, inplace=True)  # Remove builds that errored (prow error)
    df["duration"] = df["end"] - df["start"]  # From timestamps to job duration
    df["success"] = df["result"] == "SUCCESS"  # A boolean version of the result
    return df


print("Reading metadata from", metadata_file_name)
df = pd.read_csv(metadata_file_name, index_col=0)
df = clean_df(df)
df
Reading metadata from ../../../../data/raw/gcs/build-metadata/periodic-ci-openshift-release-master-ci-4.8-e2e-gcp_build-logs.csv
result size start end duration success
1429152444788510720 SUCCESS 4135 1629571472 1629576480 5008 True
1455624937803878400 SUCCESS 4579 1635883006 1635887416 4410 True
1445593776872493056 SUCCESS 4397 1633491392 1633496097 4705 True
1417019048973045760 SUCCESS 4134 1626678644 1626683464 4820 True
1427589558375026688 SUCCESS 4133 1629198851 1629203489 4638 True
... ... ... ... ... ... ...
1464437854917627904 SUCCESS 4579 1637984169 1637988828 4659 True
1420899046205165568 SUCCESS 4131 1627603731 1627608772 5041 True
1410375749352820736 FAILURE 8880668 1625094759 1625100569 5810 False
1422945097544110080 SUCCESS 4133 1628091552 1628096732 5180 True
1462490101803126784 SUCCESS 4581 1637519789 1637524977 5188 True

1080 rows × 6 columns

# Get a list of paths to the local copy of each build log
build_logs = []
for build_id in df.index:
    with open(log_path_for(build_id), "r") as f:
        build_logs.append(f.read())

Train Sklearn Pipeline#

token_pattern = r"\b[a-z][a-z0-9_/\.-]+\b"
vectorizer = TfidfVectorizer(
    min_df=0.03,
    token_pattern=token_pattern,
)

k = 3
kmeans = KMeans(n_clusters=k, random_state=123)

pipeline = Pipeline([("tfidf", vectorizer), ("kmeans", kmeans)])
pipeline.fit(build_logs)
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(min_df=0.03,
                                 token_pattern='\\b[a-z][a-z0-9_/\\.-]+\\b')),
                ('kmeans', KMeans(n_clusters=3, random_state=123))])
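To get a feel for what the fitted pipeline has learned, one option is to look at the highest-weighted TF-IDF terms in each KMeans cluster center. This is a small sketch added for illustration (it assumes a scikit-learn version that provides get_feature_names_out):

# Show the top-weighted terms for each cluster center (optional inspection)
terms = pipeline.named_steps["tfidf"].get_feature_names_out()
centers = pipeline.named_steps["kmeans"].cluster_centers_
for cluster_id, center in enumerate(centers):
    top_terms = [terms[i] for i in center.argsort()[::-1][:10]]
    print(f"Cluster {cluster_id}: {', '.join(top_terms)}")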

Save Pipeline#

joblib.dump(pipeline, "model.joblib")
['model.joblib']
# Test set: a small sample of short build logs for sanity checks
test_set = [i for i in build_logs if len(i) < 5000][:25]
# Sanity check to see if the saved model works locally
pipeline_loaded = joblib.load("model.joblib")
pipeline_loaded
pipeline_loaded.predict(test_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)
# Set credentials for your s3 storage
s3_endpoint_url = os.getenv("S3_ENDPOINT")
aws_access_key_id = os.getenv("S3_ACCESS_KEY")
aws_secret_access_key = os.getenv("S3_SECRET_KEY")
s3_bucket = os.getenv("S3_BUCKET")
s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
bucket = s3_resource.Bucket(name=s3_bucket)
# Upload your model
bucket.upload_file(
    "model.joblib", "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib"
)

# Check if your model exists on s3
objects = [
    obj.key for obj in bucket.objects.filter(Prefix="") if "model.joblib" in obj.key
]
objects
['ai4ci/build-log-clustering/tfidf-kmeans/model.joblib',
 'ai4ci/failure-classifier/model.joblib',
 'ai4ci/github-pr-ttm/model/model.joblib',
 'github/ttm-model-raw-data/pipeline/model.joblib',
 'github/ttm-model/model.joblib',
 'github/ttm-model/pipeline/model.joblib']
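As an extra sanity check (not in the original notebook), we can round-trip the artifact: download the uploaded model under a different file name and confirm that it still loads and predicts.

# Optional round-trip check: download the uploaded model and run it again
bucket.download_file(
    "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib", "model_from_s3.joblib"
)
joblib.load("model_from_s3.joblib").predict(test_set[:5])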

Test Seldon deployment service#

We use the deployment config to deploy a Seldon service that serves the model saved above.
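The deployment config itself lives outside this notebook. Roughly, it is a SeldonDeployment manifest that points Seldon's prebuilt SKLearn server at the model URI we uploaded above; the sketch below (written as a Python dict) is illustrative, and the secret name and bucket placeholder are assumptions rather than the exact values used.

# Illustrative sketch of a SeldonDeployment manifest, not the exact config used
seldon_deployment = {
    "apiVersion": "machinelearning.seldon.io/v1",
    "kind": "SeldonDeployment",
    "metadata": {"name": "build-log-clustering"},
    "spec": {
        "predictors": [
            {
                "name": "default",
                "graph": {
                    "name": "classifier",
                    "implementation": "SKLEARN_SERVER",
                    "modelUri": "s3://<bucket>/ai4ci/build-log-clustering/tfidf-kmeans",
                    "envSecretRefName": "s3-credentials",  # hypothetical secret name
                },
                "replicas": 1,
            }
        ]
    },
}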

# Inference service URL
base_url = "http://build-log-clustering-ds-ml-workflows-ws.apps.smaug.na.operate-first.cloud/predict"
# wrap the list of build logs in the payload format expected by Seldon
data = {"data": {"ndarray": test_set}}

# create the query payload
json_data = json.dumps(data)
headers = {"content-Type": "application/json"}

# query our inference service
response = requests.post(base_url, data=json_data, headers=headers)
response
<Response [200]>
response.json()
{'data': {'names': [],
  'ndarray': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]},
 'meta': {'requestPath': {'classifier': 'registry.connect.redhat.com/seldonio/sklearnserver@sha256:88d126455b150291cbb3772f67b4f35a88bb54b15ff7c879022f77fb051615ad'}}}
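The cluster assignments come back under data.ndarray. As a quick follow-up (assuming the response shape shown above), we can pull them out and compare them with the local pipeline's predictions to confirm the deployed model matches the saved one.

# Extract the predicted cluster ids and compare them with the local predictions
remote_clusters = response.json()["data"]["ndarray"]
local_clusters = pipeline_loaded.predict(test_set)
all(int(r) == int(l) for r, l in zip(remote_clusters, local_clusters))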

Conclusion#

In this notebook, we saw how to create and save an unsupervised model for clustering build logs. We successfully deployed and tested the model using S3 for storage and a Seldon service on OpenShift.