Create Template
Import Functions
from utils.notebookhelpers.helpers import Helpers
from utils.dtos.templateOutputCollection import TemplateOutputCollection
from utils.dtos.variable import Metadata
from utils.dtos.templateOutput import TemplateOutput
from utils.dtos.templateOutput import OutputType
from utils.dtos.templateOutput import ChartType
from utils.notebookhelpers.schemaHelpers import SchemaHelpers
from utils.functions import FunctionLib
Create context
# Create (or reuse) the template execution context for this recipe.
# localVars=locals() lets the helper discover variables defined in this scope.
contextId = 'recipe_split'
context = Helpers.getOrCreateContext(contextId=contextId, localVars=locals())
Create Input Dataset
# Declare the template's input dataset parameter (shown in the UI as
# "Input Dataset"); its runtime value is resolved later via .value.
inputDatasetParameter=Helpers.get_or_create_input_dataset(
name="inputDataset",
metadata=Metadata(
input_name='Input Dataset',
is_required=True,
tooltip='Dataset to apply the train and test split'),
local_context=locals()
)
Create input parameters
# Declare the target-column input parameter (the label column y).
targetCol = Helpers.get_or_create_input_var(
name="targetCol",
metadata=Metadata(
input_name="Target Column",
is_required=True,
default_value="target",
tooltip="Input the target column to predict on (label y)",
datatypes=['STRING']
),
local_context=locals()
)
# Declare the test-size input parameter (fraction of rows held out for test).
# NOTE: despite datatypes=['DOUBLE'], the server delivers this as a string;
# it is converted with float() after getAllParams below.
test_size = Helpers.get_or_create_input_var(
name="test_size",
metadata=Metadata(
input_name="Test Size",
is_required=True,
default_value= 0.2,
tooltip="Test Size, default to be 0.2, which means 80% train and 20% test split",
datatypes=['DOUBLE']
),
local_context=locals()
)
Create the output dataset parameters (train and test)
# Declare the output dataset parameter naming the train split.
outputDatasetParameter_train = Helpers.get_or_create_output_dataset(
name="output_train",
metadata=Metadata(
input_name='Output Train Dataset',
is_required=True,
default_value="train",
tooltip='Train Dataset to be created'),
local_context=locals()
)
# Declare the output dataset parameter naming the test split.
outputDatasetParameter_test = Helpers.get_or_create_output_dataset(
name="output_test",
metadata=Metadata(
input_name='Output Test Dataset',
is_required=True,
default_value="test",
tooltip='Test Dataset to be created'),
local_context=locals()
)
# Resolve all declared parameters against the server, then unwrap each
# parameter object into its plain runtime value.
Helpers.getAllParams(context)

inputDataset = inputDatasetParameter.value
targetCol = targetCol.value
# The server hands test_size over as a string; coerce it to float here.
test_size = float(test_size.value)
output_train = outputDatasetParameter_train.value
output_test = outputDatasetParameter_test.value

# Echo the resolved values for debugging.
for resolved in (inputDataset, targetCol, test_size, output_train, output_test):
    print(resolved)
raw_w_one_hot_encoding
SalePrice
0.2
train
test
# Load the input dataset (by resolved name) into a dataframe.
inDF = Helpers.getEntityData(context, inputDataset)
# NOTE(review): a system-added '__index_level_0__' column was previously
# stripped here; that cleanup is currently disabled:
#if '__index_level_0__' in inDF:
#    inDF = inDF.drop(['__index_level_0__'], axis=1)
#inDF
Build the function you want your template to execute
import pandas as pd
import numpy as np
import os
import itertools
import warnings
import re
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
# Split the input dataframe into train/test datasets.
# X: all feature columns; y: the target (label) column.
X = inDF.drop(targetCol, axis=1)
y = inDF[targetCol]
# random_state pinned for reproducible splits; stratify left disabled
# (target may be continuous, e.g. a regression label).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    #stratify=y,
                                                    test_size=test_size,
                                                    random_state=42)
# Reattach the target column so each output dataset is self-contained.
train = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
test = pd.concat([X_test, pd.DataFrame(y_test)], axis=1)
# An upstream step can introduce a stray 'index' column; drop it per-frame.
# BUG FIX: the previous code dropped 'index' from BOTH frames whenever
# EITHER frame contained it, raising KeyError when only one frame had the
# column. Checking each frame independently avoids that.
for df in (train, test):
    if 'index' in df.columns:
        df.drop(columns=['index'], inplace=True)
Create an output collection
# Register both split dataframes as template outputs and persist the context.
outputCollection = Helpers.createOutputCollection(context)
# NOTE: output_train/output_test are rebound here from name strings to
# TemplateOutput dataset objects.
output_train = Helpers.createTemplateOutputDataset(context=context, outputName=output_train, dataFrame=train)
outputCollection.addTemplateOutput(output_train)
output_test = Helpers.createTemplateOutputDataset(context=context, outputName=output_test, dataFrame=test)
outputCollection.addTemplateOutput(output_test)
Helpers.save(context)