utils.functions.FunctionLib
Module Contents
Functions
- utils.functions.FunctionLib.write_to_file(file_name, data_str, append=False)
- utils.functions.FunctionLib.read_file_raw(file_name)
- utils.functions.FunctionLib.read_file_lines(file_name)
- utils.functions.FunctionLib.getColsByType(inDF, dataType=None)
- utils.functions.FunctionLib.rename_col_wc(inDF, wc_map={}, inplace=True)
- utils.functions.FunctionLib.gen_combination_sets(lst, min_group_size=None, max_group_size=None, permutation_or_combination='combination')
Generate combination and permutation sets for experiment analysis (see the usage sketch after this list).
- utils.functions.FunctionLib.merge_prev_next_df(inDF, dim_cols_sets, cols_to_copy, prev_col_prefix='__prev_', next_col_prefix='__next_', attach_prev=True, attach_next=True)
Merge the previous event with the next event and create a DataFrame, placing the previous and next events in prefixed companion columns (see the usage sketch after this list).
- Parameters:
  - inDF (_type_): _description_
  - dim_cols_sets (_type_): _description_
  - cols_to_copy (_type_): _description_
  - prev_col_prefix (str, optional): _description_. Defaults to "__prev_".
  - next_col_prefix (str, optional): _description_. Defaults to "__next_".
- Returns:
_description_
- Return type:
_type_
- utils.functions.FunctionLib.split_list(input_list, num_chunks=10, chunk_size=-1)
- utils.functions.FunctionLib.sanitize_name_str(in_str)
- utils.functions.FunctionLib.sanitize_dataset_col_names(input_df, sanitize_only_columns=False)
- utils.functions.FunctionLib.clean_column_name(text)
- utils.functions.FunctionLib.set_data_types(entityDf, entitySchema)
- utils.functions.FunctionLib.get_null_cols(df, percent_cutoff=1.0)
- utils.functions.FunctionLib.get_constant_cols(df)
Get constant columns; NaNs do not count as a unique value. Input: X_train. Output: constant column names. See the usage sketch after this list.
- utils.functions.FunctionLib.get_topval(column, topn=10, reverse=False)
- utils.functions.FunctionLib.get_entropy(column, base=None)
- utils.functions.FunctionLib.get_datetime_cols(df, sample_size=1000)
Get the datetime columns in df.
- utils.functions.FunctionLib.get_object_to_int_columns(df, sample_size=1000)
Find the object columns that can be converted to int. Input: df. Output: the object columns that can be converted to integer. See the conversion sketch after this list.
- utils.functions.FunctionLib.convert_object_to_int(df, object_to_int_cols)
Convert the given object columns to integers, based on the object-to-int column list.
- utils.functions.FunctionLib.get_int_cols(df, sample_size=1000)
Get the integer columns using the system schema.
- utils.functions.FunctionLib.split_df(input_df, split_ratio=0.7)
- utils.functions.FunctionLib.filter_datadiff(source_df, target_df, cols_to_match, topsize=1000, max_iterations=1)
- utils.functions.FunctionLib.mergefiles(dfs, countfiles, i=0)
- utils.functions.FunctionLib.check_invalid_datatype(datatype, value)
- utils.functions.FunctionLib.get_col_stats(input_df, fieldschema=None, childDir=None, df_name='pd_dataframe', is_lite_mode=True, sample_size=1000000)
- utils.functions.FunctionLib.gen_bin_cols(input_df, childDir, col_stats, bin_col_list, label_prefix=None, num_bins=10)
- utils.functions.FunctionLib.attach_bin_cols(df, childDir, col_list)
- utils.functions.FunctionLib.gen_counters(df, childDir, col_list)
- utils.functions.FunctionLib.attach_counters(df, childDir, col_list)
- utils.functions.FunctionLib.gen_counters_dimed(df, childDir, col_list, key_count_col, dim_col, dim_classes=2)
- utils.functions.FunctionLib.attach_dim_counters(df, childDir, col_list)
- utils.functions.FunctionLib.gen_aggregates(df, num_cols=[], agg_cols=[], ignore_cols=[], merge_agg=True, create_child_df=False, pandas_lib=None)
- utils.functions.FunctionLib.attach_aggregates(df, childDir, dim_cols=[], num_cols=[])
- utils.functions.FunctionLib.gen_full_dfs(input_df, childDir, key_col='eid', target_col=None, cat_cols=[], num_cols=[], inline=True)
- utils.functions.FunctionLib.gen_ts_deltas(inDF, ts_cols=[])
- utils.functions.FunctionLib.gen_ts_deltas_pairs(inDF, pairs_of_ts_cols)
- utils.functions.FunctionLib.clean_df(in_df, entity_schema=None, sample_size_for_eda=1000000, is_drop_null_cols=True, is_drop_constant_cols=True, null_ratio_cutoff=1.0)
- utils.functions.FunctionLib.clean_df_get_bad_col(in_df, entity_schema=None, sample_size_for_eda=1000000, is_drop_null_cols=True, is_drop_constant_cols=True, null_ratio_cutoff=1.0)
- utils.functions.FunctionLib.get_df_correlation_matrix(input_df)
- utils.functions.FunctionLib.get_high_correlation_cols(input_df, threshold=0.9)
- utils.functions.FunctionLib.get_df_sampling(df_input, max_rows=100000, max_cols=100, sampling='first')
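The summary for `gen_combination_sets` does not spell out its return format, so the sketch below only illustrates the kind of combination and permutation sets its parameters describe, using `itertools`. The helper name and its return value are assumptions for illustration, not the library's implementation.

```python
from itertools import combinations, permutations

# Illustration only: gen_combination_sets' exact return format is not documented
# above; this sketch shows the kind of sets its parameters describe.
def combination_sets_sketch(lst, min_group_size=None, max_group_size=None,
                            permutation_or_combination="combination"):
    min_size = min_group_size or 1
    max_size = max_group_size or len(lst)
    make = combinations if permutation_or_combination == "combination" else permutations
    sets = []
    for size in range(min_size, max_size + 1):
        sets.extend(make(lst, size))
    return sets

# Every non-empty combination of the three factors:
# ('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')
print(combination_sets_sketch(["a", "b", "c"]))
```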
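For `merge_prev_next_df`, the sketch below shows the previous/next-event idea with plain pandas `shift` inside a per-key group. The sample frame, its column names, and the single grouping key are illustrative assumptions rather than the library's actual logic, which also accepts multiple dimension column sets.

```python
import pandas as pd

# Hypothetical event data; the column names are illustrative only.
events = pd.DataFrame({
    "user_id": ["u1", "u1", "u1", "u2", "u2"],
    "event":   ["open", "click", "buy", "open", "close"],
    "ts":      pd.to_datetime(["2024-01-01 10:00", "2024-01-01 10:05",
                               "2024-01-01 10:09", "2024-01-02 09:00",
                               "2024-01-02 09:30"]),
})

# Attach the previous and next event per user, mirroring the prev/next
# column prefixes that merge_prev_next_df takes as parameters.
events = events.sort_values(["user_id", "ts"])
grouped = events.groupby("user_id")
for col in ["event", "ts"]:
    events["__prev_" + col] = grouped[col].shift(1)
    events["__next_" + col] = grouped[col].shift(-1)

print(events)
```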
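`get_constant_cols` states its rule directly: a column is constant when it has at most one distinct non-NaN value. A minimal pandas equivalent of that rule, assuming a DataFrame input; this is a sketch, not the library function itself.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "all_ones":   [1, 1, 1, 1],
    "one_or_nan": [1, np.nan, 1, np.nan],  # still constant: NaN is not a unique value
    "varies":     [1, 2, 3, 4],
})

# Sketch of the documented rule, not the library's implementation.
constant_cols = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]
print(constant_cols)  # ['all_ones', 'one_or_nan']
```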
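`get_object_to_int_columns` and `convert_object_to_int` are documented only by their input and output, so the sketch below shows one plausible reading with `pandas.to_numeric`: detect object columns whose values all parse as whole numbers, then cast them. The detection rule and the sample frame are assumptions, not the library's code.

```python
import pandas as pd

df = pd.DataFrame({
    "id_str":  ["1", "2", "3"],   # every value parses as an integer
    "mixed":   ["1", "x", "3"],   # not convertible
    "already": [10, 20, 30],
})

# Assumed detection rule: an object column is convertible when all of its
# values parse as numbers with no fractional part.
convertible = []
for col in df.select_dtypes(include="object").columns:
    parsed = pd.to_numeric(df[col], errors="coerce")
    if parsed.notna().all() and (parsed % 1 == 0).all():
        convertible.append(col)

# Cast the detected columns, as convert_object_to_int is described as doing.
for col in convertible:
    df[col] = pd.to_numeric(df[col]).astype(int)

print(convertible)   # ['id_str']
print(df.dtypes)     # id_str is now int64; mixed stays object
```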