Data Profile
Summary info
def singlevalue(my_series):
    """
    Count the distinct values that occur exactly once in a series.

    input: pd.Series
    output: int, the number of values whose frequency is 1
            (missing values are not counted, because value_counts() skips them by default)
    """
    # Compute the frequency table once instead of once for the mask and once for the lookup
    counts = my_series.value_counts()
    # int() keeps the return type a plain Python int, as len() did before
    return int((counts == 1).sum())
def df_explore(data_explore):
    """
    Print the shape of a dataframe and return a per-column profile.

    input: pandas dataframe
    output: pandas dataframe with one row per input column (in the input's
    column order) and the columns
        nunique - the number of unique values in the column
        null%   - the percentage of null values in the column
        nsingle - the number of values that occur exactly once (see singlevalue)
        dtypes  - the data type of the column
    side effect: prints "1. The shape of the data is (rows, cols)"
    """
    print("1. The shape of the data is " + str(data_explore.shape))

    # Every Series below is indexed by the column names of data_explore, so
    # they can be assembled directly. The previous chain of how='outer' joins
    # is documented to sort the index, which would detach the row order from
    # the frame's column order.
    return pd.DataFrame({
        'nunique': data_explore.nunique(),
        'null%': data_explore.isnull().sum() / len(data_explore) * 100,
        'nsingle': data_explore.apply(singlevalue, axis=0),
        'dtypes': data_explore.dtypes,
    })
Missing values
# Draw the null mask of `data` as a heatmap: row labels and the colour bar are switched off
sns.heatmap(
    data.isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)
Continuous Variable
Empirical cumulative distribution functions (ECDF)
def ecdf(data):
    """Return (x, y) for the ECDF of a one-dimensional array of measurements.

    x holds the measurements in ascending order; y holds the matching
    cumulative fractions 1/n, 2/n, ..., 1 where n is the number of measurements.
    """
    sample_size = len(data)
    # The i-th smallest observation has cumulative fraction i / n
    cumulative = np.arange(1, sample_size + 1) / sample_size
    return np.sort(data), cumulative
# ECDF of petal length, one (x, y) pair per species
x_set, y_set = ecdf(setosa_petal_length)
x_vers, y_vers = ecdf(versicolor_petal_length)
x_virg, y_virg = ecdf(virginica_petal_length)

# Overlay the three ECDFs as unconnected point markers; the plotting order
# must match the order of the legend labels below
for _xs, _ys in ((x_set, y_set), (x_vers, y_vers), (x_virg, y_virg)):
    _ = plt.plot(_xs, _ys, linestyle='none', marker='.')

# Legend and axis labels
plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')
Pre-processing
- Drop NA in one column: drop every row in which the 'Salary' column is null.
# Keep only the rows of num_vars that have a non-null 'Salary'
drop_sal_df = num_vars.dropna(subset=['Salary'])
- Fill all missing values with the mean of the column
def fill_mean(col):
    """Return `col` with its missing values replaced by the mean of `col`."""
    return col.fillna(col.mean())


# axis=0 hands fill_mean one column at a time, so each column is imputed with its own mean
fill_df = drop_sal_df.apply(fill_mean, axis=0)
Categorical
- Create dummy variables for a column, with NA values encoded as an additional dummy column
# One indicator column per level of 'col1', plus one indicator for missing values
dummy_cols_df = pd.get_dummies(dummy_var_df['col1'], dummy_na=True)

# Subset of df holding only the object-dtype columns
cat_df = df.select_dtypes(include=['object']).copy()

# Independent copy of that subset
cat_df_copy = cat_df.copy()

# Column labels of the categorical variables (a pandas Index, not a plain list)
cat_cols_lst = cat_df.columns
def create_dummy_df(df, cat_cols, dummy_na):
    """
    Replace the categorical columns of a dataframe with dummy (indicator) columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
               (a single column name is accepted as well)
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe that has the following characteristics:
        1. contains all columns that were not specified as categorical
        2. removes all the original columns in cat_cols
        3. dummy columns for each of the categorical columns in cat_cols,
           except for the first level of each column, which is dropped
           (drop_first=True)
        4. if dummy_na is True - it also contains dummy columns for the NaN values
        5. Use a prefix of the column name with an underscore (_) for separating
    The input dataframe itself is not modified.
    """
    # A bare string must become a one-element list; list('abc') would split it into characters
    if isinstance(cat_cols, str):
        cat_cols = [cat_cols]
    else:
        cat_cols = list(cat_cols)

    # Passing columns= makes get_dummies encode every column in cat_cols,
    # including numeric-coded ones, and keeps the remaining columns in place.
    # This avoids re-attaching the dummies with an index-on-index merge, which
    # multiplies rows whenever the index contains duplicate labels.
    return pd.get_dummies(
        df,
        columns=cat_cols,
        prefix=cat_cols,
        prefix_sep='_',
        dummy_na=dummy_na,
        drop_first=True,
    )
Linear Regression
# Explanatory variables and response, taken from the imputed frame
X = fill_df[['CareerSatisfaction', 'HoursPerWeek', 'JobSatisfaction', 'StackOverflowSatisfaction']]
y = fill_df['Salary']

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# `normalize=True` is no longer passed: the parameter was removed from
# LinearRegression in scikit-learn 1.2 and raises TypeError there. Feature
# scaling does not change the predictions of an ordinary least-squares fit,
# so the R^2 below is unaffected.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Score the model on the held-out rows
y_test_preds = lm_model.predict(X_test)
rsquared_score = r2_score(y_test, y_test_preds)  # r2_score
length_y_test = len(y_test)  # num in y_test

# Bare expression: the message is only shown when this is evaluated as the last
# expression of an interactive cell
"The r-squared score for your model was {} on {} values.".format(rsquared_score, length_y_test)