utils.functions.FunctionLib

Module Contents

Functions

write_to_file(file_name, data_str[, append])

read_file_raw(file_name)

read_file_lines(file_name)

getColsByType(inDF[, dataType])

rename_col_wc(inDF[, wc_map, inplace])

gen_combination_sets(lst[, min_group_size, ...])

Generate combination and permutation sets for experiment analysis

merge_prev_next_df(inDF, dim_cols_sets, cols_to_copy)

Merge the previous event with the next event and create a DataFrame

split_list(input_list[, num_chunks, chunk_size])

sanitize_name_str(in_str)

sanitize_dataset_col_names(input_df[, ...])

clean_column_name(text)

set_data_types(entityDf, entitySchema)

get_null_cols(df[, percent_cutoff])

get_constant_cols(df)

Get constant columns; NaNs do not count as a unique value

get_topval(column[, topn, reverse])

get_entropy(column[, base])

get_datetime_cols(df[, sample_size])

Get the datetime columns in df

get_object_to_int_columns(df[, sample_size])

Find the object columns that can be converted to int

convert_object_to_int(df, object_to_int_cols)

Convert object columns to integers, based on the detected convertible columns

get_int_cols(df[, sample_size])

Get the integer columns using the system schema

split_df(input_df[, split_ratio])

filter_datadiff(source_df, target_df, cols_to_match[, ...])

mergefiles(dfs, countfiles[, i])

check_invalid_datatype(datatype, value)

get_col_stats(input_df[, fieldschema, childDir, ...])

gen_bin_cols(input_df, childDir, col_stats, bin_col_list)

attach_bin_cols(df, childDir, col_list)

gen_counters(df, childDir, col_list)

attach_counters(df, childDir, col_list)

gen_counters_dimed(df, childDir, col_list, ...[, ...])

attach_dim_counters(df, childDir, col_list)

gen_aggregates(df[, num_cols, agg_cols, ignore_cols, ...])

attach_aggregates(df, childDir[, dim_cols, num_cols])

gen_full_dfs(input_df, childDir[, key_col, ...])

gen_ts_deltas(inDF[, ts_cols])

gen_ts_deltas_pairs(inDF, pairs_of_ts_cols)

clean_df(in_df[, entity_schema, sample_size_for_eda, ...])

clean_df_get_bad_col(in_df[, entity_schema, ...])

get_df_correlation_matrix(input_df)

get_high_correlation_cols(input_df[, threshold])

get_df_sampling(df_input[, max_rows, max_cols, sampling])

utils.functions.FunctionLib.write_to_file(file_name, data_str, append=False)
utils.functions.FunctionLib.read_file_raw(file_name)
utils.functions.FunctionLib.read_file_lines(file_name)
utils.functions.FunctionLib.getColsByType(inDF, dataType=None)
utils.functions.FunctionLib.rename_col_wc(inDF, wc_map={}, inplace=True)
utils.functions.FunctionLib.gen_combination_sets(lst, min_group_size=None, max_group_size=None, permutation_or_combination='combination')

Generate combination and permutation sets for experiment analysis.
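
The entry above gives no example, so the following is an illustrative sketch of how such sets are typically built with itertools; the exact output shape of gen_combination_sets is an assumption here, not part of the documented API:

    # Illustrative sketch only (hypothetical helper, not the library function).
    # Assumes the result is a flat list of tuples covering every group size
    # between min_group_size and max_group_size.
    from itertools import combinations, permutations

    def gen_combination_sets_sketch(lst, min_group_size=None, max_group_size=None,
                                    permutation_or_combination='combination'):
        min_group_size = min_group_size or 1
        max_group_size = max_group_size or len(lst)
        func = combinations if permutation_or_combination == 'combination' else permutations
        sets = []
        for size in range(min_group_size, max_group_size + 1):
            sets.extend(func(lst, size))
        return sets

    # gen_combination_sets_sketch(['a', 'b', 'c'], max_group_size=2)
    # -> [('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c')]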

utils.functions.FunctionLib.merge_prev_next_df(inDF, dim_cols_sets, cols_to_copy, prev_col_prefix='__prev_', next_col_prefix='__next_', attach_prev=True, attach_next=True)

Merge the previous event with the next event and create a DataFrame. Also put the previous and next event in the same column.

Parameters:

inDF (_type_): _description_

dim_cols_sets (_type_): _description_

cols_to_copy (_type_): _description_

prev_col_prefix (str, optional): _description_. Defaults to "__prev_".

next_col_prefix (str, optional): _description_. Defaults to "__next_".

Returns:

_description_

Return type:

_type_
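
As a concept sketch, a previous/next merge of this kind can be expressed with a grouped shift in pandas; this illustrates the idea for a single set of dimension columns and is not the function's actual implementation (handling of dim_cols_sets, attach_prev, and attach_next is omitted):

    # Conceptual sketch of a prev/next event merge (hypothetical helper).
    import pandas as pd

    def merge_prev_next_sketch(in_df, dim_cols, cols_to_copy,
                               prev_col_prefix='__prev_', next_col_prefix='__next_'):
        out = in_df.copy()
        grouped = in_df.groupby(dim_cols, sort=False)
        for col in cols_to_copy:
            out[prev_col_prefix + col] = grouped[col].shift(1)   # previous event in the group
            out[next_col_prefix + col] = grouped[col].shift(-1)  # next event in the group
        return out

    events = pd.DataFrame({'user': ['u1', 'u1', 'u2'],
                           'ts': [1, 2, 1],
                           'action': ['login', 'click', 'login']})
    merged = merge_prev_next_sketch(events, ['user'], ['ts', 'action'])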

utils.functions.FunctionLib.split_list(input_list, num_chunks=10, chunk_size=-1)
utils.functions.FunctionLib.sanitize_name_str(in_str)
utils.functions.FunctionLib.sanitize_dataset_col_names(input_df, sanitize_only_columns=False)
utils.functions.FunctionLib.clean_column_name(text)
utils.functions.FunctionLib.set_data_types(entityDf, entitySchema)
utils.functions.FunctionLib.get_null_cols(df, percent_cutoff=1.0)
utils.functions.FunctionLib.get_constant_cols(df)

Get constant columns; NaNs do not count as a unique value. Input: X_train. Output: constant column names.
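
A minimal sketch of the described behavior (not the library's implementation), where a column counts as constant when it has at most one distinct non-NaN value:

    import pandas as pd

    def get_constant_cols_sketch(df):
        # nunique(dropna=True) ignores NaNs, so an all-NaN or single-valued
        # column is reported as constant.
        return [col for col in df.columns if df[col].nunique(dropna=True) <= 1]

    X_train = pd.DataFrame({'a': [1, 1, 1], 'b': [1, None, 1], 'c': [1, 2, 3]})
    # get_constant_cols_sketch(X_train) -> ['a', 'b']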

utils.functions.FunctionLib.get_topval(column, topn=10, reverse=False)
utils.functions.FunctionLib.get_entropy(column, base=None)
utils.functions.FunctionLib.get_datetime_cols(df, sample_size=1000)

Get the datetime columns in df.
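
A sketch of sample-based datetime detection; the parsing rules and the use of sample_size in the real function are assumptions:

    import pandas as pd

    def get_datetime_cols_sketch(df, sample_size=1000):
        sample = df.head(sample_size)
        datetime_cols = []
        for col in sample.columns:
            if pd.api.types.is_datetime64_any_dtype(sample[col]):
                datetime_cols.append(col)
            elif sample[col].dtype == object:
                parsed = pd.to_datetime(sample[col], errors='coerce')
                if parsed.notna().mean() > 0.9:  # most sampled values parse as dates
                    datetime_cols.append(col)
        return datetime_cols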

utils.functions.FunctionLib.get_object_to_int_columns(df, sample_size=1000)

Find the object columns that can be converted to int. Input: df. Output: the object columns that can be converted to integer.
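
A sketch of the detection step: an object column qualifies when a sample of its non-null values casts cleanly to int (the sampling strategy is an assumption):

    import pandas as pd

    def get_object_to_int_columns_sketch(df, sample_size=1000):
        cols = []
        for col in df.select_dtypes(include='object').columns:
            sample = df[col].dropna().head(sample_size)
            try:
                sample.astype(int)       # raises if any value is not integer-like
                cols.append(col)
            except (ValueError, TypeError):
                pass
        return cols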

utils.functions.FunctionLib.convert_object_to_int(df, object_to_int_cols)

Convert the object columns to integers, based on the detected convertible columns.
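
A sketch of the conversion step, using pandas' nullable Int64 dtype so missing values survive the cast; the real function may use a plain int cast instead:

    import pandas as pd

    def convert_object_to_int_sketch(df, object_to_int_cols):
        out = df.copy()
        for col in object_to_int_cols:
            # assumes the values are integer-like, as found by the detection step
            out[col] = pd.to_numeric(out[col], errors='coerce').astype('Int64')
        return out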

utils.functions.FunctionLib.get_int_cols(df, sample_size=1000)

Get the integer columns using the system schema.
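
As a rough sketch, integer columns can be found with a dtype check on a sample; the system-schema lookup mentioned above is not reproduced here:

    import pandas as pd

    def get_int_cols_sketch(df, sample_size=1000):
        sample = df.head(sample_size)
        return [col for col in sample.columns
                if pd.api.types.is_integer_dtype(sample[col])]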

utils.functions.FunctionLib.split_df(input_df, split_ratio=0.7)
utils.functions.FunctionLib.filter_datadiff(source_df, target_df, cols_to_match, topsize=1000, max_iterations=1)
utils.functions.FunctionLib.mergefiles(dfs, countfiles, i=0)
utils.functions.FunctionLib.check_invalid_datatype(datatype, value)
utils.functions.FunctionLib.get_col_stats(input_df, fieldschema=None, childDir=None, df_name='pd_dataframe', is_lite_mode=True, sample_size=1000000)
utils.functions.FunctionLib.gen_bin_cols(input_df, childDir, col_stats, bin_col_list, label_prefix=None, num_bins=10)
utils.functions.FunctionLib.attach_bin_cols(df, childDir, col_list)
utils.functions.FunctionLib.gen_counters(df, childDir, col_list)
utils.functions.FunctionLib.attach_counters(df, childDir, col_list)
utils.functions.FunctionLib.gen_counters_dimed(df, childDir, col_list, key_count_col, dim_col, dim_classes=2)
utils.functions.FunctionLib.attach_dim_counters(df, childDir, col_list)
utils.functions.FunctionLib.gen_aggregates(df, num_cols=[], agg_cols=[], ignore_cols=[], merge_agg=True, create_child_df=False, pandas_lib=None)
utils.functions.FunctionLib.attach_aggregates(df, childDir, dim_cols=[], num_cols=[])
utils.functions.FunctionLib.gen_full_dfs(input_df, childDir, key_col='eid', target_col=None, cat_cols=[], num_cols=[], inline=True)
utils.functions.FunctionLib.gen_ts_deltas(inDF, ts_cols=[])
utils.functions.FunctionLib.gen_ts_deltas_pairs(inDF, pairs_of_ts_cols)
utils.functions.FunctionLib.clean_df(in_df, entity_schema=None, sample_size_for_eda=1000000, is_drop_null_cols=True, is_drop_constant_cols=True, null_ratio_cutoff=1.0)
utils.functions.FunctionLib.clean_df_get_bad_col(in_df, entity_schema=None, sample_size_for_eda=1000000, is_drop_null_cols=True, is_drop_constant_cols=True, null_ratio_cutoff=1.0)
utils.functions.FunctionLib.get_df_correlation_matrix(input_df)
utils.functions.FunctionLib.get_high_correlation_cols(input_df, threshold=0.9)
utils.functions.FunctionLib.get_df_sampling(df_input, max_rows=100000, max_cols=100, sampling='first')