Build and Evaluate a Linear Risk Model
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
import numpy as np  # assumed to be imported in the notebook's setup cell; shown here for completeness

def make_standard_normal(df_train, df_test):
    """
    In order to make the data closer to a normal distribution, take log
    transforms to reduce the skew.
    Then standardize the distribution with a mean of zero and standard deviation of 1.

    Args:
        df_train (dataframe): unnormalized training data.
        df_test (dataframe): unnormalized test data.

    Returns:
        df_train_normalized (dataframe): normalized training data.
        df_test_normalized (dataframe): normalized test data.
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # Remove skew by applying the log function to the train set and to the test set
    df_train_unskewed = np.log(df_train)
    df_test_unskewed = np.log(df_test)

    # calculate the mean and standard deviation of the training set
    mean = df_train_unskewed.mean()
    stdev = df_train_unskewed.std(ddof=1)
    # stdev = np.std(df_train_unskewed, ddof=1)  # same result, but see the note at the end of this section

    # standardize the training set
    df_train_standardized = (df_train_unskewed - mean) / stdev

    # standardize the test set using the training set's mean and stdev
    # (see instructions and hints above)
    df_test_standardized = (df_test_unskewed - mean) / stdev
    ### END CODE HERE ###

    return df_train_standardized, df_test_standardized
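
A minimal usage sketch (the column names and values below are made up for illustration; the assignment's real features come from the provided dataset):

import numpy as np
import pandas as pd

# hypothetical raw features; np.log requires strictly positive values
df_train = pd.DataFrame({'Age': [52.0, 61.0, 45.0],
                         'Systolic_BP': [120.0, 140.0, 135.0]})
df_test = pd.DataFrame({'Age': [58.0, 49.0],
                        'Systolic_BP': [130.0, 125.0]})

train_norm, test_norm = make_standard_normal(df_train, df_test)
print(train_norm.mean())  # approximately 0 for each column
print(train_norm.std())   # 1 for each column, up to floating-point precision

Note that the test set is standardized with the training set's mean and stdev, so its columns will generally not come out with mean 0 and standard deviation 1.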
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def lr_model(X_train, y_train):
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # import the LogisticRegression class
    from sklearn.linear_model import LogisticRegression

    # create the model object
    model = LogisticRegression()

    # fit the model to the training data
    model.fit(X_train, y_train)
    ### END CODE HERE ###

    # return the fitted model
    return model
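
A usage sketch with toy data (feature names and values here are invented; in the assignment, X_train and y_train come from the preprocessing steps above):

import numpy as np
import pandas as pd

# hypothetical standardized features and binary outcomes
X_train = pd.DataFrame({'f1': [-1.0, 0.2, 1.3, -0.5],
                        'f2': [0.1, -0.7, 0.9, 1.1]})
y_train = np.array([0, 0, 1, 1])

model = lr_model(X_train, y_train)

# risk scores are the predicted probabilities of the positive class
scores = model.predict_proba(X_train)[:, 1]
print(scores.shape)  # (4,)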
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def cindex(y_true, scores):
    '''
    Input:
        y_true (np.array): a 1-D array of true binary outcomes (values of zero or one)
            0: patient does not get the disease
            1: patient does get the disease
        scores (np.array): a 1-D array of corresponding risk scores output by the model

    Output:
        c_index (float): (concordant pairs + 0.5 * ties) / number of permissible pairs
    '''
    n = len(y_true)
    assert len(scores) == n

    concordant = 0
    permissible = 0
    ties = 0

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # use two nested for loops to go through all unique pairs of patients
    for i in range(n):
        for j in range(i + 1, n):  # choose the range of j so that j > i
            # check if the pair is permissible (the patient outcomes are different)
            if y_true[i] != y_true[j]:
                # count the pair if it's permissible
                permissible = permissible + 1

                # for permissible pairs, check if they are concordant or are ties
                # check for ties in the score
                if scores[i] == scores[j]:
                    # count the tie
                    ties = ties + 1
                    # if it's a tie, we don't need to check patient outcomes;
                    # continue to the top of the for loop
                    continue

                # case 1: patient i doesn't get the disease, patient j does
                if y_true[i] == 0 and y_true[j] == 1:
                    # check if patient i has a lower risk score than patient j
                    if scores[i] < scores[j]:
                        # count the concordant pair
                        concordant = concordant + 1
                    # otherwise patient i has a higher risk score and the pair is
                    # not concordant (ties were already handled above)

                # case 2: patient i gets the disease, patient j does not
                if y_true[i] == 1 and y_true[j] == 0:
                    # check if patient i has a higher risk score than patient j
                    if scores[i] > scores[j]:
                        # count the concordant pair
                        concordant = concordant + 1
                    # otherwise patient i has a lower risk score and the pair is
                    # not concordant (ties were already handled above)

    # calculate the c-index using the counts of permissible, concordant, and tied pairs
    c_index = (concordant + 0.5 * ties) / permissible
    ### END CODE HERE ###

    return c_index
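
A sanity check on a tiny, made-up example. For binary outcomes, the c-index as defined here coincides with the area under the ROC curve, so sklearn's roc_auc_score provides an independent cross-check:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1])
scores = np.array([0.2, 0.5, 0.3, 0.8])

# 4 permissible pairs, 3 concordant, 0 ties -> (3 + 0.5 * 0) / 4 = 0.75
print(cindex(y_true, scores))         # 0.75
print(roc_auc_score(y_true, scores))  # 0.75, matching value for binary outcomes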
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def add_interactions(X):
    """
    Add interaction terms between columns to dataframe.

    Args:
        X (dataframe): Original data

    Returns:
        X_int (dataframe): Original data with interaction terms appended.
    """
    features = X.columns
    m = len(features)
    X_int = X.copy(deep=True)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # 'i' loops through all features in the original dataframe X
    for i in range(m):
        # get the name of feature 'i'
        feature_i_name = features[i]

        # get the data for feature 'i'
        feature_i_data = X[feature_i_name]

        # choose the index of column 'j' to be greater than column 'i'
        for j in range(i + 1, m):
            # get the name of feature 'j'
            feature_j_name = features[j]

            # get the data for feature 'j'
            feature_j_data = X[feature_j_name]

            # create the name of the interaction feature by combining both names
            # example: "apple" and "orange" are combined to be "apple_x_orange"
            feature_i_j_name = f"{feature_i_name}_x_{feature_j_name}"

            # multiply the data for feature 'i' and feature 'j',
            # and store the result as a column in dataframe X_int
            X_int[feature_i_j_name] = feature_i_data * feature_j_data
    ### END CODE HERE ###

    return X_int
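
A usage sketch (the column names "apple", "orange", and "pear" echo the example in the comments and are purely illustrative):

import pandas as pd

X = pd.DataFrame({'apple': [1.0, 2.0], 'orange': [3.0, 4.0], 'pear': [5.0, 6.0]})
X_int = add_interactions(X)
print(list(X_int.columns))
# ['apple', 'orange', 'pear', 'apple_x_orange', 'apple_x_pear', 'orange_x_pear']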
Note: the standard-deviation step must be written as stdev = df_train_unskewed.std(ddof=1). Using np.std will not pass the grader, even though the result is the same.
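
To see why the two calls can disagree in general (a minimal demonstration, separate from the graded cells): pandas' .std() defaults to ddof=1 (sample standard deviation), while np.std() defaults to ddof=0 (population standard deviation), so the explicit ddof=1 is what keeps the two numerically identical here.

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
print(s.std())            # 1.2909... (pandas default ddof=1)
print(np.std(s))          # 1.1180... (numpy default ddof=0)
print(np.std(s, ddof=1))  # 1.2909..., matches s.std()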