import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler, StandardScaler
from catboost import CatBoostClassifier


import warnings
# Suppress every warning category for the rest of the run. Note that this also
# hides deprecation/future warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

# Read the labelled training split and the unlabelled test split from the
# current working directory.
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

# Stack the training features (label column removed) on top of the test rows so
# that imputation and feature engineering are applied to both in one pass.
wdata = pd.concat(
    [train_dataset.drop(columns='Transported'), test_dataset],
    sort=False,
)

# Notebook-style peek at the first rows (no effect when run as a script).
train_dataset.head(5)

def fill_nan(df):
    """Impute the missing values of the passenger table in place.

    Steps, in order:

    1. For passengers younger than 13 or in cryosleep, the five spending
       columns and ``VIP`` are forced to 0 (this overwrites existing values,
       not only missing ones).
    2. ``HomePlanet``, ``Destination``, ``CryoSleep`` and ``VIP`` are filled
       with their most frequent value.
    3. ``Name`` and ``Cabin`` take the value of the next row that has one;
       trailing gaps (no later value) fall back to the previous row's value.
    4. Remaining gaps in the spending columns are filled with 0.
    5. ``Age`` is filled with the mean age of passengers younger than 61.

    Args:
        df: DataFrame containing the columns named above. It is modified
            in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # The mask only reads Age and CryoSleep, neither of which is written in the
    # loop below, so it is computed once. Missing Age/CryoSleep compare False.
    no_spend = (df["Age"] < 13) | df["CryoSleep"].eq(True)
    for col in spend_columns + ['VIP']:
        df[col] = np.where(no_spend, 0, df[col])

    # Fill categorical columns with their mode.
    categorial_columns = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']
    for col in categorial_columns:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Free-text columns: borrow the next row's value. Assigning the result back
    # (instead of fillna(method=..., inplace=True)) avoids the deprecated
    # `method` argument and chained in-place mutation; the extra ffill covers
    # gaps at the very end of the frame that a backward fill cannot reach.
    ill_columns = ['Name', 'Cabin']
    for col in ill_columns:
        df[col] = df[col].bfill().ffill()

    # Any spending value still missing is treated as "spent nothing".
    df[spend_columns] = df[spend_columns].fillna(0)

    # Mean age is taken over passengers under 61 only.
    df['Age'] = df['Age'].fillna(df.loc[df['Age'] < 61, 'Age'].mean())

    return df

# Impute the combined train+test frame in a single pass, then print the number
# of missing values left in each column as a sanity check.
# Per-split alternative, kept for reference:
# train_dataset = fill_nan(train_dataset)
# test_dataset = fill_nan(test_dataset)
wdata = fill_nan(wdata)
print(wdata.isna().sum())

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

def data_preprocessing(df):
    """Turn the imputed passenger table into an all-numeric feature frame.

    Features produced:

    * ``CryoSleep`` / ``VIP`` cast to 0/1 integers.
    * One-hot indicators for the cabin deck (``CabinDeckA`` ... ``CabinDeckF``)
      and cabin side (``CabinSideP``, ``CabinSideS``), parsed from ``Cabin``
      which is split on ``/`` into deck, number and side.
    * ``TotalSpent``, ``Luxury`` (RoomService + Spa + VRDeck) and ``Normal``
      (FoodCourt + ShoppingMall).
    * ``IsAlone``: 1 when no other row shares the passenger's group, where
      the group is the part of ``PassengerId`` before the underscore.
    * ``NoFamily``: 1 when no other row shares the passenger's second name
      token.

    Args:
        df: DataFrame with the raw columns and no missing values in the
            columns used here. Helper columns are added to this object
            before the final ``drop`` creates the returned frame.

    Returns:
        A new DataFrame with the identifier, text and helper columns removed.
    """
    # Boolean flags become 0/1 integers.
    for flag in ('CryoSleep', 'VIP'):
        df[flag] = df[flag].astype(int)

    # Cabin is split on "/" into three parts: deck, number and side.
    df[['CabinDeck', 'CabinNumber', 'CabinSide']] = df['Cabin'].str.split("/", expand=True)
    # Deck order A, G, T, B, C, D, E, F is kept so the output column order
    # (and therefore the scaled feature matrix) is unchanged.
    for deck in 'AGTBCDEF':
        df['CabinDeck' + deck] = (df['CabinDeck'] == deck).astype(int)
    for side in 'PS':
        df['CabinSide' + side] = (df['CabinSide'] == side).astype(int)

    # Ordinal codes for HomePlanet and Destination. `map` is used rather than
    # `replace` to avoid pandas' deprecated silent downcasting.
    # NOTE(review): both columns are in the drop list below, so these codes do
    # not reach the model; remove them from the drop list if that is unintended.
    df['HomePlanet'] = df['HomePlanet'].map({'Earth': 1, 'Mars': 2, 'Europa': 3})
    df['Destination'] = df['Destination'].map(
        {'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J318.5-22': 3})

    # Spending aggregates.
    df['Luxury'] = df['RoomService'] + df['Spa'] + df['VRDeck']
    df['Normal'] = df['FoodCourt'] + df['ShoppingMall']
    df['TotalSpent'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    # Restore the original column order: TotalSpent, Luxury, Normal.
    df['Luxury'] = df.pop('Luxury')
    df['Normal'] = df.pop('Normal')

    # Travel group = everything before the underscore in PassengerId. Slicing
    # a fixed three characters would merge distinct four-digit groups such as
    # 0010 and 0011 into one.
    df['PassengerId_group'] = df['PassengerId'].str.split('_').str[0]
    df['Group_size'] = df['PassengerId_group'].map(df['PassengerId_group'].value_counts())
    df['IsAlone'] = (df['Group_size'] == 1).astype(int)

    # Family = second whitespace-separated token of the name. `.str[1]` yields
    # NaN for single-token names instead of raising when no name has a space.
    df["FamilyName"] = df["Name"].str.split(' ').str[1]
    df['Family_size'] = df['FamilyName'].map(df['FamilyName'].value_counts())
    df['NoFamily'] = (df['Family_size'] == 1).astype(int)

    # Remove identifiers, raw text and intermediate helper columns.
    df = df.drop(['PassengerId', 'PassengerId_group', 'FamilyName', 'Name', 'Group_size', 'Family_size',
                  'Cabin', 'Destination', 'HomePlanet', 'CabinDeck', 'CabinNumber', 'CabinSide'], axis=1)

    return df

# Engineer features on the combined train+test frame.
# Per-split alternative, kept for reference:
# train_dataset = data_preprocessing(train_dataset)
# test_dataset_features = data_preprocessing(test_dataset)
# train_dataset_features = train_dataset.drop('Transported', axis=1)
wdata = data_preprocessing(wdata)
wdata.head(5)

# Standardise every feature column; fit_transform returns a NumPy array, so
# wdata is no longer a DataFrame after this point. The RobustScaler instance
# is created as an alternative but is not applied.
rs = RobustScaler()
ss = StandardScaler()
wdata = ss.fit_transform(wdata)

# The first len(train_dataset) rows came from train.csv, the remainder from
# test.csv (see the concat that built wdata), so cut the array at that boundary.
train_dataset_features, test_dataset_features = np.split(wdata, [len(train_dataset)])

# Target labels as 0/1 integers.
train_dataset_labels = train_dataset['Transported'].astype(int)

# Optional hold-out split, currently disabled:
# train_dataset_features, val_dataset_features = train_test_split(train_dataset_features, test_size=0.2, random_state=42)
# train_dataset_labels, val_dataset_labels = train_test_split(train_dataset_labels, test_size=0.2, random_state=42)

print(train_dataset_features.shape)

(8693, 23)

# from sklearn.tree import DecisionTreeClassifier
# dt = DecisionTreeClassifier()
# dt.fit(train_dataset_features_pca, train_dataset_labels)
# dt_prediction = dt.predict(test_dataset_features_pca).astype(bool)
# output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': dt_prediction})
# output.to_csv('submission.csv', index=False)

# Logistic Regression
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: ", lr.score(train_dataset_features_pca, train_dataset_labels))
# print("This is the score of the validation set: ", lr.score(val_dataset_features_pca, val_dataset_labels))

# lr_prediction = lr.predict(test_dataset_features_pca).astype(bool)
# lr_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': lr_prediction})
# lr_output.to_csv('lr_submission.csv', index=False)

# from sklearn.svm import SVC
# svc = SVC(kernel='rbf')
# svc.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(svc.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(svc.score(val_dataset_features_pca, val_dataset_labels)))

# svc_prediction = svc.predict(test_dataset_features_pca).astype(bool)
# svc_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': svc_prediction})
# svc_output.to_csv('svc_submission.csv', index=False)

# # mlp
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier(hidden_layer_sizes=(100, 70), activation='relu', solver='adam', learning_rate='adaptive', batch_size=16, alpha=1)
# mlp.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(mlp.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(mlp.score(val_dataset_features_pca, val_dataset_labels)))

# mlp_prediction = mlp.predict(test_dataset_features_pca).astype(bool)
# mlp_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': mlp_prediction})
# mlp_output.to_csv('mlp_submission.csv', index=False)

# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=10, max_depth=20, min_samples_leaf=20)
# rf.fit(train_dataset_features_pca, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features_pca, train_dataset_labels)))
# print("This is the score of the validation set: " + str(rf.score(val_dataset_features_pca, val_dataset_labels)))

# rf_prediction = rf.predict(test_dataset_features_pca).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)

# rf = RandomForestClassifier(n_estimators=60, max_depth=20, min_samples_leaf=20, criterion='entropy')
# rf.fit(train_dataset_features, train_dataset_labels)
# print("This is the score of the training set: " + str(rf.score(train_dataset_features, train_dataset_labels)))
# # print("This is the score of the validation set: " + str(rf.score(val_dataset_features, val_dataset_labels)))

# rf_prediction = rf.predict(test_dataset_features).astype(bool)
# rf_output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Transported': rf_prediction})
# rf_output.to_csv('rf_submission.csv', index=False)

# import optuna
# from sklearn.model_selection import KFold, cross_val_score

# kf = KFold(n_splits = 10)
# def objective(trial):
    
#     params = {
#          'iterations': trial.suggest_int("iterations", 50, 1000),
#         'learning_rate': trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         'depth': trial.suggest_int("depth", 4, 10),
#         'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
#         'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         'bagging_temperature':trial.suggest_float("bagging_temperature", 0.0, 10.0),
#         'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
#         'od_wait':trial.suggest_int("od_wait", 10, 50),
#     }
    
#     clf = CatBoostClassifier(**params, verbose = 0, random_seed = 0)
#     clf.fit(train_dataset_features_pca, train_dataset_labels)
    
#     scores = cross_val_score(clf, train_dataset_features_pca, train_dataset_labels,
#                              cv = kf, scoring = 'accuracy', n_jobs=-1)
    
#     return np.mean(scores)


# study = optuna.create_study(direction = 'maximize')
# study.optimize(objective, n_trials = 100)

# print('Best hyperparameters:', study.best_params)
# print('Best Acuuracy:', study.best_value)

# Fixed CatBoost hyper-parameters (presumably the best trial of the
# commented-out Optuna study above; the keys match its search space).
param = dict(
    iterations=997,
    learning_rate=0.006179823417619039,
    depth=5,
    l2_leaf_reg=0.015049126492951247,
    bootstrap_type='Bayesian',
    random_strength=0.0031066667545993952,
    bagging_temperature=0.45869966946262664,
    od_type='IncToDec',
    od_wait=42,
)

# Fit on the full training matrix and report the score on that same data;
# no validation score is computed because the hold-out split is disabled.
cat = CatBoostClassifier(eval_metric='Accuracy', verbose=0, **param)
cat.fit(train_dataset_features, train_dataset_labels)
print(
    "This is the score of the training set: "
    + str(cat.score(train_dataset_features, train_dataset_labels))
)

This is the score of the training set: 0.8253767399056712

# # LDA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lda = LinearDiscriminantAnalysis()
# lda.fit(train_dataset_features_pca, train_dataset_labels)
# lda.score(val_dataset_features_pca, val_dataset_labels)

# from tensorflow import keras
# from keras import layers

# model = keras.Sequential([
#     layers.Dense(10, activation='relu', input_shape=[3]),
#     layers.Dropout(0.3),
#     layers.Dense(20, activation='relu'),
#     # dropout
#     layers.Dropout(0.2),
#     layers.Dense(1, activation='sigmoid'),
# ])

# model.compile(
#     optimizer='adam',
#     loss='binary_crossentropy',
#     metrics=['binary_accuracy'],
# )

# history = model.fit(
#     train_dataset_features_pca, train_dataset_labels,
#     validation_data=(val_dataset_features_pca, val_dataset_labels),
#     batch_size=256,
#     epochs=1000,
#     verbose=0, # hide the output because we have so many epochs
# )

# history_df = pd.DataFrame(history.history)
# # Start the plot at epoch 5
# history_df.loc[5:, ['loss', 'val_loss']].plot()
# history_df.loc[5:, ['binary_accuracy', 'val_binary_accuracy']].plot()

# print(("Best Validation Loss: {:0.4f}" +\
#       "\nBest Validation Accuracy: {:0.4f}")\
#       .format(history_df['val_loss'].min(), 
#               history_df['val_binary_accuracy'].max()))

# Predict the test rows, cast the predicted labels to booleans and write the
# two-column submission file (PassengerId, Transported) without the index.
cat_prediction = cat.predict(test_dataset_features).astype(bool)
cat_output = test_dataset[['PassengerId']].assign(Transported=cat_prediction)
cat_output.to_csv('cat_submission.csv', index=False)

	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	CabinDeckA	...	CabinDeckF	CabinSideP	CabinSideS	TotalSpent	Luxury	Normal
0	39.0	0	0.0	0.0	0.0	0.0	0.0	0	...	0	1	0	0.0	0.0	0.0
1	24.0	0	109.0	9.0	25.0	549.0	44.0	0	...	1	0	1	736.0	702.0	34.0
2	58.0	1	43.0	3576.0	0.0	6715.0	49.0	1	...	0	0	1	10383.0	6807.0	3576.0
3	33.0	0	0.0	1283.0	371.0	3329.0	193.0	1	...	0	0	1	5176.0	3522.0	1654.0
4	16.0	0	303.0	70.0	151.0	565.0	2.0	0	...	1	0	1	1091.0	870.0	221.0

Spaceship Titanic with 0.81295 Accuracy¶

1. 导入包和数据¶

2. 数据预处理¶

3. 开始分类训练¶

决策树分类¶

逻辑斯蒂回归¶

SVM分类¶

多层感知机分类¶

集成学习¶

Catboost¶

上面的代码是训练参数，下面得到最好的参数¶

线性鉴别分析¶

神经网络¶

4. Output¶

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name	Transported
0	0001_01	Europa	False	B/0/P	TRAPPIST-1e	39.0	False	0.0	0.0	0.0	0.0	0.0	Maham Ofracculy	False
1	0002_01	Earth	False	F/0/S	TRAPPIST-1e	24.0	False	109.0	9.0	25.0	549.0	44.0	Juanna Vines	True
2	0003_01	Europa	False	A/0/S	TRAPPIST-1e	58.0	True	43.0	3576.0	0.0	6715.0	49.0	Altark Susent	False
3	0003_02	Europa	False	A/0/S	TRAPPIST-1e	33.0	False	0.0	1283.0	371.0	3329.0	193.0	Solam Susent	False
4	0004_01	Earth	False	F/1/S	TRAPPIST-1e	16.0	False	303.0	70.0	151.0	565.0	2.0	Willy Santantines	True