Data Science Project Cheat Sheet

Data Profile

Summary info

def singlevalue(my_series):
    """
    Count the distinct values that occur exactly once in a Series.

    input: pd.Series
    output: int, the number of values whose frequency is 1
    """
    # Tally the frequencies once and reuse the result, instead of calling
    # value_counts() separately for the data and for the mask.
    counts = my_series.value_counts()
    return int((counts == 1).sum())

def df_explore(data_explore):
    """
    Profile every column of a dataframe.

    input: pandas dataframe
    output:
    Prints the shape of the data frame, then returns a dataframe with one
    row per input column and these columns:
    - 'nunique': the number of unique values in the column
    - 'null%':   the percentage of null values in the column
    - 'nsingle': the number of values appearing exactly once in the column
    - 'dtypes':  the data type of the column
    """
    shape_text = str(data_explore.shape)
    print("1. The shape of the data is " + shape_text)

    row_count = len(data_explore)

    # Each statistic is a one-column frame keyed by the input's column names.
    unique_counts = pd.DataFrame(
        data_explore.apply(lambda col: col.nunique(), axis=0)
    ).rename(columns={0: 'nunique'})
    null_share = pd.DataFrame(
        data_explore.isnull().sum() / row_count * 100
    ).rename(columns={0: 'null%'})
    single_counts = pd.DataFrame(
        data_explore.apply(singlevalue, axis=0)
    ).rename(columns={0: 'nsingle'})
    column_types = data_explore.dtypes.to_frame().rename(columns={0: 'dtypes'})

    # Stitch the statistics together side by side on the shared index.
    profile = unique_counts
    for part in (null_share, single_counts, column_types):
        profile = profile.join(part, how='outer')
    return profile

Missing values

# Draw the boolean null mask of `data` as a heat map so missing cells stand
# out; row tick labels and the colour bar are switched off.
sns.heatmap(
    data.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

Continuous Variable

Empirical cumulative distribution functions (ECDF)

def ecdf(data):
    """Return the ECDF coordinates of a one-dimensional set of measurements.

    The result is a pair ``(x, y)``: ``x`` holds the measurements sorted in
    ascending order and ``y`` holds the matching cumulative fractions
    ``1/n, 2/n, ..., 1`` for ``n = len(data)``.
    """
    count = len(data)
    sorted_values = np.sort(data)
    # The i-th smallest point (1-based) sits at cumulative fraction i / count.
    cumulative_fraction = np.arange(1, count + 1) / count
    return sorted_values, cumulative_fraction
# ECDF coordinates for the petal lengths of each iris species
x_set, y_set = ecdf(setosa_petal_length)
x_vers, y_vers = ecdf(versicolor_petal_length)
x_virg, y_virg = ecdf(virginica_petal_length)

# Overlay the three ECDFs as unconnected dots; the plotting order has to
# match the order of the legend labels below.
for x_vals, y_vals in ((x_set, y_set), (x_vers, y_vers), (x_virg, y_virg)):
    _ = plt.plot(x_vals, y_vals, marker='.', linestyle='none')

# Legend and axis labels
plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

Pre-processing

  • Drop NA based on one column: drop every row in which the 'Salary' column is null.
# Drop rows (axis=0) whose 'Salary' entry is missing; other columns may
# still contain nulls afterwards.
drop_sal_df = num_vars.dropna(axis=0, how='any', subset=['Salary'])
  • Fill all missing values with the mean of the column
def fill_mean(col):
    """Return `col` with its missing values replaced by the column's mean."""
    return col.fillna(col.mean())


# axis=0 hands fill_mean one column at a time, so every column is filled
# with its own mean.
fill_df = drop_sal_df.apply(fill_mean, axis=0)

Categorical

  • Create dummy variables for a column, with NA encoded as its own dummy column
# One-hot encode 'col1'; dummy_na=True adds an extra indicator column for NaN.
dummy_cols_df = pd.get_dummies(dummy_var_df['col1'], dummy_na=True)

# Select only the object-dtype columns of df, as an independent copy.
cat_df = df.select_dtypes(include='object').copy()
# Keep a second copy of the categorical dataframe.
cat_df_copy = cat_df.copy()
# Column names of the categorical variables (a pandas Index, not a list).
cat_cols_lst = cat_df.columns

def create_dummy_df(df, cat_cols, dummy_na):
    """
    Replace the categorical columns of a dataframe with dummy columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list-like of strings that are the names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe (the input is not modified) that has the following characteristics:
    1. contains all columns that were not specified as categorical
    2. removes all the original columns in cat_cols
    3. dummy columns for each of the categorical columns in cat_cols
       (the first level of each column is dropped, drop_first=True)
    4. if dummy_na is True - it also contains dummy columns for the NaN values
    5. dummy column names use the original column name as a prefix,
       separated from the level by an underscore (_)
    """
    # Accept any list-like (e.g. a pandas Index) and make emptiness testable.
    cat_cols = list(cat_cols)
    if not cat_cols:
        # Nothing to encode: hand back an independent copy of the input.
        return df.copy()

    dummies = pd.get_dummies(df[cat_cols], prefix=cat_cols, prefix_sep='_',
                             dummy_na=dummy_na, drop_first=True)
    remaining = df.drop(cat_cols, axis=1)

    # Both frames carry df's index in the same row order, so place them side
    # by side. An index merge would multiply rows whenever the index holds
    # duplicate labels.
    return pd.concat([remaining, dummies], axis=1)

Linear Regression

  • Numeric variables
# Predictors: the numeric survey columns; target: salary.
X = fill_df[['CareerSatisfaction', 'HoursPerWeek', 'JobSatisfaction', 'StackOverflowSatisfaction']]
y = fill_df['Salary']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# `normalize=True` is deliberately not passed: scikit-learn deprecated the
# parameter in 1.0 and removed it in 1.2, where it raises a TypeError.
# Ordinary least squares predictions do not depend on feature scaling, so the
# r-squared score below is unaffected (up to floating-point error).
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

y_test_preds = lm_model.predict(X_test)

rsquared_score = r2_score(y_test, y_test_preds)  # r2_score
length_y_test = len(y_test)  # num in y_test

# In a notebook this bare expression is echoed as the cell output; wrap it in
# print() when running as a script.
"The r-squared score for your model was {} on {} values.".format(rsquared_score, length_y_test)

Pages