Spaceship Tiantic with 0.81295 Accuracy¶

1. 导入包和数据¶

In [434]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler, StandardScaler
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")
In [435]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

wdata = pd.concat([train_dataset.drop('Transported', axis = 1), test_dataset], axis=0, sort=False)

2. 数据预处理¶

In [436]:
train_dataset.head(5)
Out[436]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name Transported
0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy False
1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 Juanna Vines True
2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 Altark Susent False
3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 Solam Susent False
4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 Willy Santantines True
In [438]:
def fill_nan(df):
    # for columns which has 0 values in 'Age', fill its 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' with 0
    df["RoomService"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["RoomService"])
    df["FoodCourt"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["FoodCourt"])
    df["ShoppingMall"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["ShoppingMall"])
    df["Spa"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["Spa"])
    df["VRDeck"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["VRDeck"])
    df["VIP"] = np.where((df["Age"] < 13) | (df["CryoSleep"] == True), 0, df["VIP"])

    # fill categorical columns with mode
    categorial_columns = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']
    for col in categorial_columns:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    ill_columns = ['Name', 'Cabin']
    for col in ill_columns:
        df[col].fillna(method='bfill', inplace=True)

    # fill numerical columns with mean
    numerical_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in numerical_columns:
        df[col] = df[col].fillna(0)

    df['Age'] = df['Age'].fillna(df[df['Age'] < 61]['Age'].mean())

    return df

# train_dataset = fill_nan(train_dataset)
# test_dataset = fill_nan(test_dataset)
wdata = fill_nan(wdata)
print(wdata.isnull().sum())
# print(train_dataset.isnull().sum())
PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64
In [439]:
def data_preprocessing(df):
    # boolean columns to int
    df['CryoSleep'] = df['CryoSleep'].astype(int)
    df['VIP'] = df['VIP'].astype(int)

    # Cabin has three values, build three columns to show three different values
    df[['CabinDeck',  'CabinNumber', 'CabinSide']]=df['Cabin'].str.split("/", expand=True)
    df['CabinDeckA'] = (df['CabinDeck'] == 'A').astype(int)
    df['CabinDeckG'] = (df['CabinDeck'] == 'G').astype(int)
    df['CabinDeckT'] = (df['CabinDeck'] == 'T').astype(int)
    df['CabinDeckB'] = (df['CabinDeck'] == 'B').astype(int)
    df['CabinDeckC'] = (df['CabinDeck'] == 'C').astype(int)
    df['CabinDeckD'] = (df['CabinDeck'] == 'D').astype(int)
    df['CabinDeckE'] = (df['CabinDeck'] == 'E').astype(int)
    df['CabinDeckF'] = (df['CabinDeck'] == 'F').astype(int)
    # # if the cabin is B or C, then CabinDeckBC = 1
    # df['CabinDeckBC'] = ((df['CabinDeck'] == 'B') | (df['CabinDeck'] == 'C')).astype(int)
    # # if the cabin is D or E or F, then CabinDeckDEF = 1
    # df['CabinDeckDEF'] = ((df['CabinDeck'] == 'D') | (df['CabinDeck'] == 'E') | (df['CabinDeck'] == 'F')).astype(int)

    # df['CabinSide'] = df['CabinSide'].replace({'P': 1, 'S': 2})
    df['CabinSideP'] = (df['CabinSide'] == 'P').astype(int)
    df['CabinSideS'] = (df['CabinSide'] == 'S').astype(int)

    # Homeplant has 3 values, replace three different values with 1,2,3
    df['HomePlanet'] = df['HomePlanet'].replace({'Earth': 1, 'Mars': 2, 'Europa': 3})
    # df['HomePlanetE'] = (df['HomePlanet'] == 'Earth').astype(int)
    # df['HomePlanetM'] = (df['HomePlanet'] == 'Mars').astype(int)
    # df['HomePlanetEu'] = (df['HomePlanet'] == 'Europa').astype(int)

    # Destination has 3 values, build three columns to show three different values
    df['Destination'] = df['Destination'].replace({'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J318.5-22': 3})

    # sum money spent on all items: RoomService, FoodCourt, ShoppingMall, Spa, VRDeck.
    df['TotalSpent'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    # df['TotslSpent'] = np.log(df['TotalSpent'] + 1)
    df['Luxury'] = df['RoomService'] + df['Spa'] + df['VRDeck']
    df['Normal'] = df['FoodCourt'] + df['ShoppingMall']
    # df['Spent'] = (df['TotalSpent'] > 0).astype(int)
    
    # append feature which is about if the passenger is alone
    df['PassengerId_group'] = df['PassengerId'].str[0:3]
    df['Group_size'] = df['PassengerId_group'].map(df['PassengerId_group'].value_counts())
    df['IsAlone'] = (df['Group_size'] == 1).astype(int)

    df["FamilyName"] = df["Name"].str.split(' ', expand = True)[1]
    df['Family_size'] = df['FamilyName'].map(df['FamilyName'].value_counts())
    df['NoFamily'] = (df['Family_size'] == 1).astype(int)

    # add column which is about if the passenger is below 4
    # df['Age_below_4'] = (df['Age'] < 4).astype(int)
    # df['Aeg_below_19'] = (df['Age'] < 19).astype(int)

    # drop columns
    df = df.drop(['PassengerId', 'PassengerId_group', 'FamilyName', 'Name', 'Group_size', 'Family_size',
                  'Cabin', 'Destination', 'HomePlanet', 'CabinDeck', 'CabinNumber', 'CabinSide'], axis=1)
    
    # df = df.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1)

    return df

# train_dataset = data_preprocessing(train_dataset)
# test_dataset_features = data_preprocessing(test_dataset)
# train_dataset_features = train_dataset.drop('Transported', axis=1)

wdata = data_preprocessing(wdata)
wdata.head(5)
Out[439]:
CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck CabinDeckA CabinDeckG ... CabinDeckD CabinDeckE CabinDeckF CabinSideP CabinSideS TotalSpent Luxury Normal IsAlone NoFamily
0 0 39.0 0 0.0 0.0 0.0 0.0 0.0 0 0 ... 0 0 0 1 0 0.0 0.0 0.0 0 0
1 0 24.0 0 109.0 9.0 25.0 549.0 44.0 0 0 ... 0 0 1 0 1 736.0 702.0 34.0 0 0
2 0 58.0 1 43.0 3576.0 0.0 6715.0 49.0 1 0 ... 0 0 0 0 1 10383.0 6807.0 3576.0 0 0
3 0 33.0 0 0.0 1283.0 371.0 3329.0 193.0 1 0 ... 0 0 0 0 1 5176.0 3522.0 1654.0 0 0
4 0 16.0 0 303.0 70.0 151.0 565.0 2.0 0 0 ... 0 0 1 0 1 1091.0 870.0 221.0 0 0

5 rows × 23 columns

In [440]:
rs=RobustScaler()
ss=StandardScaler()
# wdata = rs.fit_transform(wdata)
wdata = ss.fit_transform(wdata)
In [441]:
train_dataset_features = wdata[:len(train_dataset)]
test_dataset_features = wdata[len(train_dataset):]

train_dataset_labels = train_dataset['Transported'].astype(int)

# train_dataset_features, val_dataset_features = train_test_split(train_dataset_features, test_size=0.2, random_state=42)
# train_dataset_labels, val_dataset_labels = train_test_split(train_dataset_labels, test_size=0.2, random_state=42)

print(train_dataset_features.shape)
# train_dataset_features.head(5)
(8693, 23)

3. 开始分类训练¶

决策树分类¶

In [445]:
# from sklearn.tree import DecisionTreeClassifier
# dt = DecisionTreeClassifier()
# dt.fit(train_dataset_features_pca, train_dataset_labels)
# dt_prediction = dt.predict(test_dataset_features_pca).astype(bool)
# output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': dt_prediction})
# output.to_csv('submission.csv', index=False)

逻辑斯蒂回归¶

In [446]:
# Logistic Regression
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: ", lr.score(train_dataset_features_pca, train_dataset_labels))
# print("This is the score of the validation set: ", lr.score(val_dataset_features_pca, val_dataset_labels))

# lr_prediction = lr.predict(test_dataset_features_pca).astype(bool)
# lr_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': lr_prediction})
# lr_output.to_csv('lr_submission.csv', index=False)

SVM分类¶

In [447]:
# from sklearn.svm import SVC
# svc = SVC(kernel='rbf')
# svc.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(svc.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(svc.score(val_dataset_features_pca, val_dataset_labels)))

# svc_prediction = svc.predict(test_dataset_features_pca).astype(bool)
# svc_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': svc_prediction})
# svc_output.to_csv('svc_submission.csv', index=False)

多层感知机分类¶

In [448]:
# # mlp
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier(hidden_layer_sizes=(100, 70), activation='relu', solver='adam', learning_rate='adaptive', batch_size=16, alpha=1)
# mlp.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(mlp.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(mlp.score(val_dataset_features_pca, val_dataset_labels)))

# mlp_prediction = mlp.predict(test_dataset_features_pca).astype(bool)
# mlp_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': mlp_prediction})
# mlp_output.to_csv('mlp_submission.csv', index=False)

集成学习¶

In [449]:
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=10, max_depth=20, min_samples_leaf=20)
# rf.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(rf.score(val_dataset_features_pca, val_dataset_labels)))

# rf_prediction = rf.predict(test_dataset_features_pca).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)
In [450]:
# rf = RandomForestClassifier(n_estimators=60, max_depth=20, min_samples_leaf=20, criterion='entropy')
# rf.fit(train_dataset_features, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features, train_dataset_labels)))
# # print("This is the score of the validation set: " + str(rf.score(val_dataset_features, val_dataset_labels)))

# rf_prediction = rf.predict(test_dataset_features).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)

Catboost¶

In [451]:
# import optuna
# from sklearn.model_selection import KFold, cross_val_score

# kf = KFold(n_splits = 10)
# def objective(trial):
    
#     params = {
#          'iterations': trial.suggest_int("iterations", 50, 1000),
#         'learning_rate': trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         'depth': trial.suggest_int("depth", 4, 10),
#         'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
#         'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         'bagging_temperature':trial.suggest_float("bagging_temperature", 0.0, 10.0),
#         'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
#         'od_wait':trial.suggest_int("od_wait", 10, 50),
#     }
    
#     clf = CatBoostClassifier(**params, verbose = 0, random_seed = 0)
#     clf.fit(train_dataset_features_pca, train_dataset_labels)
    
#     scores = cross_val_score(clf, train_dataset_features_pca, train_dataset_labels,
#                              cv = kf, scoring = 'accuracy', n_jobs=-1)
    
#     return np.mean(scores)


# study = optuna.create_study(direction = 'maximize')
# study.optimize(objective, n_trials = 100)

# print('Best hyperparameters:', study.best_params)
# print('Best Acuuracy:', study.best_value)

上面的代码是训练参数,下面得到最好的参数¶

In [452]:
param = {'iterations': 997, 'learning_rate': 0.006179823417619039, 'depth': 5, 'l2_leaf_reg': 0.015049126492951247, 'bootstrap_type': 'Bayesian', 'random_strength': 0.0031066667545993952, 'bagging_temperature': 0.45869966946262664, 'od_type': 'IncToDec', 'od_wait': 42}
In [454]:
cat = CatBoostClassifier(**param, eval_metric='Accuracy', verbose=0)
cat.fit(train_dataset_features, train_dataset_labels)
print("This is the score of the training set: " + str(cat.score(train_dataset_features, train_dataset_labels)))
# print("This is the score of the validation set: " + str(cat.score(val_dataset_features, val_dataset_labels)))
This is the score of the training set: 0.8253767399056712

线性鉴别分析¶

In [457]:
# # LDA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lda = LinearDiscriminantAnalysis()
# lda.fit(train_dataset_features_pca, train_dataset_labels)
# lda.score(val_dataset_features_pca, val_dataset_labels)

神经网络¶

In [458]:
# from tensorflow import keras
# from keras import layers

# model = keras.Sequential([
#     layers.Dense(10, activation='relu', input_shape=[3]),
#     layers.Dropout(0.3),
#     layers.Dense(20, activation='relu'),
#     # dropout
#     layers.Dropout(0.2),
#     layers.Dense(1, activation='sigmoid'),
# ])

# model.compile(
#     optimizer='adam',
#     loss='binary_crossentropy',
#     metrics=['binary_accuracy'],
# )

# history = model.fit(
#     train_dataset_features_pca, train_dataset_labels,
#     validation_data=(val_dataset_features_pca, val_dataset_labels),
#     batch_size=256,
#     epochs=1000,
#     verbose=0, # hide the output because we have so many epochs
# )

# history_df = pd.DataFrame(history.history)
# # Start the plot at epoch 5
# history_df.loc[5:, ['loss', 'val_loss']].plot()
# history_df.loc[5:, ['binary_accuracy', 'val_binary_accuracy']].plot()

# print(("Best Validation Loss: {:0.4f}" +\
#       "\nBest Validation Accuracy: {:0.4f}")\
#       .format(history_df['val_loss'].min(), 
#               history_df['val_binary_accuracy'].max()))

4. Output¶

In [459]:
cat_prediction = cat.predict(test_dataset_features).astype(bool)
cat_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': cat_prediction})
cat_output.to_csv('cat_submission.csv', index=False)