import pandas as pd
import os
# Change this single directory to point at your local copy of the project;
# every other path below is derived from it.
data_path = r'C:\Users\rtgtx\Desktop\SYSC5703\Destination_Prediction_Project'
test_data = pd.read_csv(os.path.join(data_path, "test.csv"))
submission_file_path = os.path.join(data_path, "submission.csv")

# Regions that have a zone-feature file under <data_path>/Zone_features.
zone_feature_regions = ("Chukyo", "Higashisurugawan", "Kinki", "Kyushu", "Tokyo")
# Regions that have a training file under <data_path>/train ("Kinki" has none).
train_regions = ("Chukyo", "Higashisurugawan", "Kyushu", "Tokyo")

zone_feature_files_paths = {
    region: os.path.join(data_path, "Zone_features", f"{region}_zone_feature_area.csv")
    for region in zone_feature_regions
}
train_files_paths = {
    region: os.path.join(data_path, "train", f"{region}.csv")
    for region in train_regions
}

# Load the data: one DataFrame per region, keyed by region name.
zone_feature_files = {name: pd.read_csv(path) for name, path in zone_feature_files_paths.items()}
train_files = {name: pd.read_csv(path) for name, path in train_files_paths.items()}
# Missing values
def check_missing_values(dataframes):
    """Report the columns that contain missing values in each DataFrame.

    Args:
        dataframes: Mapping of a name to a pandas DataFrame.

    Returns:
        A dict with the same keys as ``dataframes``. Each value is a Series
        indexed by column name holding the null count of every column that
        has at least one null; the Series is empty when nothing is missing.
    """
    missing_values = {}
    for name, df in dataframes.items():
        null_counts = df.isnull().sum()
        # Keep only the columns that actually have missing entries.
        missing_values[name] = null_counts[null_counts > 0]
    return missing_values
missing_zone_features = check_missing_values(zone_feature_files)
missing_train_data = check_missing_values(train_files)
# A bare expression is only rendered by a notebook; print the reports so they
# are also visible when this file is run as a script.
print("Missing values in zone feature files:", missing_zone_features)
print("Missing values in training files:", missing_train_data)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
def merge_zone_features(main_df, zone_df):
    """Attach zone features to each row of ``main_df`` by its origin zone.

    Performs a left join of ``main_df['Origin']`` against
    ``zone_df['ZONE_ID']``, so every row of ``main_df`` is kept; rows whose
    origin has no match in ``zone_df`` get NaN in the zone columns.

    Args:
        main_df: DataFrame with an 'Origin' column.
        zone_df: DataFrame with a 'ZONE_ID' column plus feature columns.

    Returns:
        A new DataFrame containing the columns of both inputs.
    """
    return main_df.merge(zone_df, left_on='Origin', right_on='ZONE_ID', how='left')
# Merge datasets: attach each region's zone features to that region's
# training rows (regions are the keys of train_files).
merged_data = {
    region: merge_zone_features(trips, zone_feature_files[region])
    for region, trips in train_files.items()
}
# ignore_index gives the stacked frame unique row labels instead of repeating
# each region's own 0..n-1 index.
combined_data = pd.concat(list(merged_data.values()), axis=0, ignore_index=True)

# Keep only the rows whose destination is among the 100 most frequent ones.
top_destinations = combined_data['Destination'].value_counts().head(100).index
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed later do not write into a slice (SettingWithCopyWarning).
combined_data = combined_data[combined_data['Destination'].isin(top_destinations)].copy()
# Fill missing entries of the numeric columns below with each column's mean.
# The fitted imputer is kept in `imputer` for later reuse.
numeric_cols_to_impute = ['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(combined_data[numeric_cols_to_impute])
combined_data[numeric_cols_to_impute] = imputed_values
# Encode categorical variables of the training data, keeping one fitted
# encoder per column so the same mapping can be applied to the test data.
label_encoders = {}
fallback_labels = {}
for col in ['Gender', 'Occupation', 'Trip_type', 'Origin']:
    # Record the most frequent raw label before the column is overwritten
    # with integer codes; it replaces labels unseen during training below.
    fallback_labels[col] = combined_data[col].mode().iloc[0]
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col])
    label_encoders[col] = le

# Apply the training encoders to the test data. 'Origin' is left as-is here:
# it is encoded above but not transformed for test_data.
for col in ['Gender', 'Occupation', 'Trip_type']:
    encoder = label_encoders[col]
    # LabelEncoder.transform raises on labels it has not seen, so swap any
    # unseen test label for the most frequent training label first.
    is_known = test_data[col].isin(encoder.classes_)
    test_data[col] = encoder.transform(test_data[col].where(is_known, fallback_labels[col]))
# Columns used as model inputs.
features = [
    'Gender', 'Age', 'Occupation', 'Trip_type',
    'T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001',
]
X = combined_data[features]

# Integer-encode the target; le_destination is kept so predictions can be
# mapped back to the original 'Destination' values.
destination_col = combined_data['Destination']
le_destination = LabelEncoder()
y = le_destination.fit_transform(destination_col)

# Standardise the features (scaler is fitted on the full feature matrix).
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Sanity checks: number of distinct destinations and the 10 most frequent.
print(destination_col.nunique())
print(destination_col.value_counts().head(10))

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=64
)
# Train XGBoost on the GPU.
# NOTE(review): XGBoost 2.0 deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' with device='cuda'; left unchanged here so the script
# keeps working with the XGBoost version it was written against.
xgb_params = {
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist',
    'random_state': 64,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Predict on the held-out split and compute the evaluation metrics.
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

for _title, _result in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(_title, _result)
# Optional: KNN baseline, evaluated on its own train/test split
# (random_state=48, different from the XGBoost split).
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_scaled, y, test_size=0.2, random_state=48)
knn = KNeighborsClassifier(n_neighbors=5)
# Fit on the KNN split. Fitting on X_train/y_train (the XGBoost split) would
# let rows of X_test_knn into the training data and inflate the scores below.
knn.fit(X_train_knn, y_train_knn)
y_pred_knn = knn.predict(X_test_knn)
accuracy = accuracy_score(y_test_knn, y_pred_knn)
conf_matrix = confusion_matrix(y_test_knn, y_pred_knn)
class_report = classification_report(y_test_knn, y_pred_knn)
# These metrics are computed on the held-out KNN test split, not on the
# training data, so the label says "Test".
print("Test Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional: 5-fold cross-validation of XGBoost on the training split.
from sklearn.model_selection import cross_val_score

xgb_cv = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',
    random_state=48,
)
# One accuracy score per fold.
scores = cross_val_score(estimator=xgb_cv, X=X_train, y=y_train, cv=5, scoring='accuracy')
print("XGBoost Cross-Validation Accuracy Scores:", scores)
print("Mean Cross-Validation Accuracy:", scores.mean())
import pandas as pd
# Build the test feature matrix with the same preprocessing as the training
# data: numeric columns present, imputed with the fitted imputer, then scaled
# with the fitted scaler.
test_numeric_cols = ['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
if not set(test_numeric_cols).issubset(test_data.columns):
    # NOTE(review): assumes these columns come from the zone-feature files and
    # that test.csv covers the Kinki region (the only zone-feature file with no
    # matching training file) - confirm before submitting.
    test_data = merge_zone_features(test_data, zone_feature_files['Kinki'])
test_data[test_numeric_cols] = imputer.transform(test_data[test_numeric_cols])
X_test_scaled = scaler.transform(test_data[features])

# Predict encoded destinations and map them back to the original labels.
y_test_pred = xgb.predict(X_test_scaled)
test_data['Destination'] = le_destination.inverse_transform(y_test_pred)
submission = test_data[['Trip_id', 'Destination']]
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved to {submission_file_path}")
# 7 comments