ML pipeline

# Get the latest lib from Rapidcanvas
# !pip install --extra-index-url=https://us-central1-python.pkg.dev/rapidcanvas-361003/pypi/simple utils==0.12dev0

from utils.rc.client.requests import Requests
from utils.rc.client.auth import AuthClient

from utils.rc.dtos.env import Env
from utils.rc.dtos.env import EnvType
from utils.rc.dtos.project import Project
from utils.rc.dtos.dataset import Dataset
from utils.rc.dtos.recipe import Recipe
from utils.rc.dtos.transform import Transform
from utils.rc.dtos.artifact import Artifact
from utils.rc.dtos.dataSource import DataSource
from utils.rc.dtos.dataSource import DataSourceType
from utils.rc.dtos.dataSource import RedisStorageConfig
from utils.rc.dtos.prediction_service import PredictionService
from utils.dtos.rc_prediction_service import RCPredictionService

from utils.rc.dtos.template_v2 import TemplateV2, TemplateTransformV2

import pandas as pd
import logging
from utils.utils.log_util import LogUtil
# Configure the RapidCanvas utils logging to a simple "LEVEL:message" format at INFO.
LogUtil.set_basic_config(format='%(levelname)s:%(message)s', level=logging.INFO)
# Requests.setRootHost("https://test.dev.rapidcanvas.net/api/")
# AuthClient.setToken(email='', password='')
# Authenticate against the RapidCanvas backend. With no arguments this presumably
# falls back to cached/environment credentials — see the commented example above
# for explicit email/password login (TODO confirm against AuthClient docs).
AuthClient.setToken()
# Create a LARGE execution environment for this pipeline; no extra pip requirements.
env = Env.createEnv(name="ml_pipeline_env", description="ml_pipeline_env", envType=EnvType.LARGE, requirements="")

Creating Project

# Create an empty RapidCanvas project bound to the environment created above.
project = Project.create(
    name="Example ML Pipeline",
    description="Testing python lib",
    createEmpty=True,
    envId=env.id
)
# Bare expression: displays the new project's id as notebook cell output.
project.id

Uploading the data

# Upload the local Titanic CSV into the project as a named dataset.
titanic_dataset = project.addDataset(
    dataset_name="titanic_dataset",
    dataset_description="titanic_dataset",
    dataset_file_path="data/titanic.csv"
)

Building Model

In this step, we are training a model with some data preparation and cleaning (one-hot encoding and filling missing values) — you can see this in the TrainTheModel notebook (inside the transforms folder).

# Name under which the trained model is registered in the backend;
# reused below by the assertion check, the predict recipe, and the service.
model_name = "rf_model"

# Recipe that trains the model from the uploaded Titanic dataset.
recipe = project.addRecipe([titanic_dataset], name="build")

# Publish the training notebook as a custom template on this project.
template = TemplateV2(
    name="TrainTheModel",
    description="TrainTheModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["ML"],
)
template_transform = TemplateTransformV2(type="python", params={"notebookName": "TrainTheModel.ipynb"})
template.base_transforms = [template_transform]
template.publish("transforms/TrainTheModel.ipynb")

# Bind the template to the recipe via a transform, supply its notebook
# variables, and run the training.
transform = Transform()
transform.templateId = template.id
transform.name = "transform_1"
transform.variables = {
    "inputDataset": "titanic_dataset",
    "target": "Survived",
    "modelName": model_name,
}
recipe.add_transform(transform)
recipe.run()

To make sure your model was created in the backend, you can check using the command below.

# Verify that training registered the model in the backend.
all_models = PredictionService.get_all_models()
# Bare expression: displays the list of backend models as notebook cell output.
all_models
# The original message "models dont match" was misleading: this asserts the
# model's *presence* in the backend, not that two models match.
assert model_name in all_models, f"model '{model_name}' was not found in backend models: {all_models}"

Predicting Model

Using the model for batch predictions

# Recipe that scores the Titanic dataset in batch with the model trained above.
predict_recipe = project.addRecipe([titanic_dataset], name="predict")

# Publish the prediction notebook as a custom template on this project.
template = TemplateV2(
    name="PredictMLModel",
    description="PredictMLModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["Number", "datatype-long"],
)
template_transform = TemplateTransformV2(type="python", params={"notebookName": "PredictMLModel.ipynb"})
template.base_transforms = [template_transform]
template.publish("transforms/PredictMLModel.ipynb")

# Bind the template to the recipe and point it at the dataset and model.
transform = Transform()
transform.templateId = template.id
transform.name = "transform"
transform.variables = {
    "modelInput": titanic_dataset.name,
    "modelName": model_name,
}
# predict_recipe.prepareForLocal(transform, contextId="PredictMLModel")
predict_recipe.add_transform(transform)
predict_recipe.run()

# Fetch the batch-prediction output dataset and display its data.
output = predict_recipe.getChildrenDatasets()['output']
output.getData()

Building Features

We are skipping this part for now, but if your use case needs a feature store, it can also be provided.

# online_data_store = DataSource.createDataSource(
#     "online-redis",
#     DataSourceType.REDIS_STORAGE,
#     {RedisStorageConfig.HOST: "10.41.1.3", RedisStorageConfig.PORT: "6379"}
# )
# recipe = project.addRecipe([titanic_dataset], name="feature_store_sync")
# template = TemplateV2(
#     name="FeatureStoreSync", description="FeatureStoreSync", project_id=project.id, source="CUSTOM", status="ACTIVE", tags=["Number", "datatype-long"]
# )
# template_transform = TemplateTransformV2(type = "python", params=dict(notebookName="FeatureStoreSync.ipynb"))
# template.base_transforms = [template_transform]
# template.publish("transforms/FeatureStoreSync.ipynb")
# transform = Transform()
# transform.templateId = template.id
# transform.name = "transform_1"
# transform.variables = {
#     "datasetName": titanic_dataset.name,
#     "columns": "Name,Sex,Fare",
#     "featureEntityName": "Passenger",
#     "featureEntityColumn": "PassengerId",
#     "dataSourceName": online_data_store.name
# }
# recipe.prepareForLocal(transform, "feature_store")
# recipe.add_transform(transform)
# recipe.run()
# output = recipe.getChildrenDatasets()['feature_sync_stats']
# output.getData()

Creating the Prediction Service

To expose the model as API you need to create a service, the code below does it for you

# Expose the trained model as an API by creating a prediction service.
service_name = "RandomForestModelService"
prediction_service = PredictionService.create_service(
    name=service_name,
    description="testing purposes",
    model_name=model_name,  # exposes the model trained earlier in this pipeline
    service_obj_path="prediction_services/model.py",  # local service wrapper implementation
    env_id=None,  # presumably falls back to a default serving environment — TODO confirm
    data_source_ids=None  # no online data sources attached (feature-store step was skipped)
)

You can use the curl command shown above, or use the code below to make real-time predictions.

# PredictionService.refresh_service(prediction_service.name)
# Build a small sample payload from the first rows of the training CSV
# and send it to the service for a real-time prediction.
sample_df = pd.read_csv("data/titanic.csv")
payload = sample_df.head().to_dict()
PredictionService.predict_by_service(prediction_service.name, payload)