Build and Evaluate a Linear Risk Model
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
import numpy as np  # assumed to be imported in the notebook's setup cell; shown here for completeness

def make_standard_normal(df_train, df_test):
    """
    In order to make the data closer to a normal distribution, take log
    transforms to reduce the skew.
    Then standardize the distribution with a mean of zero and standard deviation of 1.

    Args:
        df_train (dataframe): unnormalized training data.
        df_test (dataframe): unnormalized test data.

    Returns:
        df_train_normalized (dataframe): normalized training data.
        df_test_normalized (dataframe): normalized test data.
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # Remove skew by applying the log function to the train set and to the test set
    df_train_unskewed = np.log(df_train)
    df_test_unskewed = np.log(df_test)

    # calculate the mean and standard deviation of the training set
    mean = df_train_unskewed.mean()
    stdev = df_train_unskewed.std(ddof=1)
    # stdev = np.std(df_train_unskewed, ddof=1)  # same result, but see the note at the end of this section

    # standardize the training set
    df_train_standardized = (df_train_unskewed - mean) / stdev

    # standardize the test set using the training set's mean and stdev
    # (see instructions and hints above)
    df_test_standardized = (df_test_unskewed - mean) / stdev
    ### END CODE HERE ###

    return df_train_standardized, df_test_standardized
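
A minimal usage sketch (the column names and values below are made up for illustration; the assignment's real features come from the provided dataset):

import numpy as np
import pandas as pd

# hypothetical raw features; np.log requires strictly positive values
df_train = pd.DataFrame({'Age': [52.0, 61.0, 45.0],
                         'Systolic_BP': [120.0, 140.0, 135.0]})
df_test = pd.DataFrame({'Age': [58.0, 49.0],
                        'Systolic_BP': [130.0, 125.0]})

train_norm, test_norm = make_standard_normal(df_train, df_test)
print(train_norm.mean())  # approximately 0 for each column
print(train_norm.std())   # 1 for each column, up to floating-point precision

Note that the test set is standardized with the training set's mean and stdev, so its columns will generally not come out with mean 0 and standard deviation 1.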
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def lr_model(X_train, y_train):
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # import the LogisticRegression class
    from sklearn.linear_model import LogisticRegression

    # create the model object
    model = LogisticRegression()

    # fit the model to the training data
    model.fit(X_train, y_train)
    ### END CODE HERE ###

    # return the fitted model
    return model
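
A usage sketch with toy data (feature names and values here are invented; in the assignment, X_train and y_train come from the preprocessing steps above):

import numpy as np
import pandas as pd

# hypothetical standardized features and binary outcomes
X_train = pd.DataFrame({'f1': [-1.0, 0.2, 1.3, -0.5],
                        'f2': [0.1, -0.7, 0.9, 1.1]})
y_train = np.array([0, 0, 1, 1])

model = lr_model(X_train, y_train)

# risk scores are the predicted probabilities of the positive class
scores = model.predict_proba(X_train)[:, 1]
print(scores.shape)  # (4,)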
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def cindex(y_true, scores):
    '''
    Input:
        y_true (np.array): a 1-D array of true binary outcomes (values of zero or one)
            0: patient does not get the disease
            1: patient does get the disease
        scores (np.array): a 1-D array of corresponding risk scores output by the model

    Output:
        c_index (float): (concordant pairs + 0.5 * ties) / number of permissible pairs
    '''
    n = len(y_true)
    assert len(scores) == n

    concordant = 0
    permissible = 0
    ties = 0

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # use two nested for loops to go through all unique pairs of patients
    for i in range(n):
        for j in range(i + 1, n):  # choose the range of j so that j > i
            # check if the pair is permissible (the patient outcomes are different)
            if y_true[i] != y_true[j]:
                # count the pair if it's permissible
                permissible = permissible + 1

                # for permissible pairs, check if they are concordant or are ties
                # check for ties in the score
                if scores[i] == scores[j]:
                    # count the tie
                    ties = ties + 1
                    # if it's a tie, we don't need to check patient outcomes;
                    # continue to the top of the for loop
                    continue

                # case 1: patient i doesn't get the disease, patient j does
                if y_true[i] == 0 and y_true[j] == 1:
                    # check if patient i has a lower risk score than patient j
                    if scores[i] < scores[j]:
                        # count the concordant pair
                        concordant = concordant + 1
                    # otherwise patient i has a higher risk score and the pair is
                    # not concordant (ties were already handled above)

                # case 2: patient i gets the disease, patient j does not
                if y_true[i] == 1 and y_true[j] == 0:
                    # check if patient i has a higher risk score than patient j
                    if scores[i] > scores[j]:
                        # count the concordant pair
                        concordant = concordant + 1
                    # otherwise patient i has a lower risk score and the pair is
                    # not concordant (ties were already handled above)

    # calculate the c-index using the counts of permissible, concordant, and tied pairs
    c_index = (concordant + 0.5 * ties) / permissible
    ### END CODE HERE ###

    return c_index
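
A sanity check on a tiny, made-up example. For binary outcomes, the c-index as defined here coincides with the area under the ROC curve, so sklearn's roc_auc_score provides an independent cross-check:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1])
scores = np.array([0.2, 0.5, 0.3, 0.8])

# 4 permissible pairs, 3 concordant, 0 ties -> (3 + 0.5 * 0) / 4 = 0.75
print(cindex(y_true, scores))         # 0.75
print(roc_auc_score(y_true, scores))  # 0.75, matching value for binary outcomes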
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def add_interactions(X):
    """
    Add interaction terms between columns to dataframe.

    Args:
        X (dataframe): Original data

    Returns:
        X_int (dataframe): Original data with interaction terms appended.
    """
    features = X.columns
    m = len(features)
    X_int = X.copy(deep=True)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # 'i' loops through all features in the original dataframe X
    for i in range(m):
        # get the name of feature 'i'
        feature_i_name = features[i]

        # get the data for feature 'i'
        feature_i_data = X[feature_i_name]

        # choose the index of column 'j' to be greater than column 'i'
        for j in range(i + 1, m):
            # get the name of feature 'j'
            feature_j_name = features[j]

            # get the data for feature 'j'
            feature_j_data = X[feature_j_name]

            # create the name of the interaction feature by combining both names
            # example: "apple" and "orange" are combined to be "apple_x_orange"
            feature_i_j_name = f"{feature_i_name}_x_{feature_j_name}"

            # multiply the data for feature 'i' and feature 'j',
            # and store the result as a column in dataframe X_int
            X_int[feature_i_j_name] = feature_i_data * feature_j_data
    ### END CODE HERE ###

    return X_int
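
A usage sketch (the column names "apple", "orange", and "pear" echo the example in the comments and are purely illustrative):

import pandas as pd

X = pd.DataFrame({'apple': [1.0, 2.0], 'orange': [3.0, 4.0], 'pear': [5.0, 6.0]})
X_int = add_interactions(X)
print(list(X_int.columns))
# ['apple', 'orange', 'pear', 'apple_x_orange', 'apple_x_pear', 'orange_x_pear']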
Note: the standard-deviation step must be written as stdev = df_train_unskewed.std(ddof=1). Using np.std will not pass the grader, even though the result is the same.
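
To see why the two calls can disagree in general (a minimal demonstration, separate from the graded cells): pandas' .std() defaults to ddof=1 (sample standard deviation), while np.std() defaults to ddof=0 (population standard deviation), so the explicit ddof=1 is what keeps the two numerically identical here.

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
print(s.std())            # 1.2909... (pandas default ddof=1)
print(np.std(s))          # 1.1180... (numpy default ddof=0)
print(np.std(s, ddof=1))  # 1.2909..., matches s.std()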