
SYS5405F_A1

import pandas as pd
import numpy as np
import os

# Change this path to your local project directory
data_path = r'C:\Users\rtgtx\Desktop\SYSC5703\Destination_Prediction_Project'
test_data = pd.read_csv(os.path.join(data_path, 'test.csv'))
submission_file_path = os.path.join(data_path, 'submission.csv')

zone_feature_files_paths = {
    "Chukyo": os.path.join(data_path, "Zone_features", "Chukyo_zone_feature_area.csv"),
    "Higashisurugawan": os.path.join(data_path, "Zone_features", "Higashisurugawan_zone_feature_area.csv"),
    "Kinki": os.path.join(data_path, "Zone_features", "Kinki_zone_feature_area.csv"),
    "Kyushu": os.path.join(data_path, "Zone_features", "Kyushu_zone_feature_area.csv"),
    "Tokyo": os.path.join(data_path, "Zone_features", "Tokyo_zone_feature_area.csv")
}

train_files_paths = {
    "Chukyo": os.path.join(data_path, "train", "Chukyo.csv"),
    "Higashisurugawan": os.path.join(data_path, "train", "Higashisurugawan.csv"),
    "Kyushu": os.path.join(data_path, "train", "Kyushu.csv"),
    "Tokyo": os.path.join(data_path, "train", "Tokyo.csv")
}

# Load the data
zone_feature_files = {name: pd.read_csv(path) for name, path in zone_feature_files_paths.items()}
train_files = {name: pd.read_csv(path) for name, path in train_files_paths.items()}

# Missing values
def check_missing_values(dataframes):
    missing_values = {}
    for name, df in dataframes.items():
        missing = df.isnull().sum()
        missing_values[name] = missing[missing > 0]
    return missing_values

missing_zone_features = check_missing_values(zone_feature_files)
missing_train_data = check_missing_values(train_files)
print("Missing values in zone features:", missing_zone_features)
print("Missing values in train data:", missing_train_data)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

def merge_zone_features(main_df, zone_df):
    # Attach the origin zone's attributes to each trip record
    return main_df.merge(zone_df, left_on='Origin', right_on='ZONE_ID', how='left')

# Merge each region's trips with its zone features
merged_data = {
    "Chukyo": merge_zone_features(train_files['Chukyo'], zone_feature_files['Chukyo']),
    "Higashisurugawan": merge_zone_features(train_files['Higashisurugawan'], zone_feature_files['Higashisurugawan']),
    "Kyushu": merge_zone_features(train_files['Kyushu'], zone_feature_files['Kyushu']),
    "Tokyo": merge_zone_features(train_files['Tokyo'], zone_feature_files['Tokyo'])
}

combined_data = pd.concat(merged_data.values(), axis=0)

# Top 100 most frequent destinations
top_destinations = combined_data['Destination'].value_counts().head(100).index
combined_data = combined_data[combined_data['Destination'].isin(top_destinations)]

# Impute missing values using the mean strategy for numerical columns
imputer = SimpleImputer(strategy='mean')
combined_data[['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']] = imputer.fit_transform(
    combined_data[['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']]
)
# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Occupation', 'Trip_type', 'Origin']: 
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col])
    label_encoders[col] = le

for col in ['Gender', 'Occupation', 'Trip_type']:
    # Labels seen during training
    known_labels = set(label_encoders[col].classes_)

    # Map unseen test labels to NaN, then fall back to the first training
    # class (note: classes_[0] is the alphabetically first label, not
    # necessarily the most frequent one)
    test_data[col] = test_data[col].apply(lambda x: x if x in known_labels else np.nan)
    test_data[col] = test_data[col].fillna(label_encoders[col].classes_[0])

    # Apply the label encoding fitted on the training data
    test_data[col] = label_encoders[col].transform(test_data[col])

# Encode the target variable 'Destination'
le_destination = LabelEncoder()
y = le_destination.fit_transform(combined_data['Destination'])

# Select features
features = ['Gender', 'Age', 'Occupation', 'Trip_type', 'T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
X = combined_data[features]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(combined_data['Destination'].nunique())  # Check number of unique values in 'Destination'
print(combined_data['Destination'].value_counts().head(10))  # Inspect top 10 frequent classes

# Train/test split for local evaluation
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=64)

# Train XGBoost with GPU (these arguments target XGBoost 1.x; on
# XGBoost >= 2.0 use tree_method='hist', device='cuda' and drop use_label_encoder)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=64)
xgb.fit(X_train, y_train)
# Predict and evaluate
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional baseline: KNN model (for comparison only, not used for the submission)

X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_scaled, y, test_size=0.2, random_state=48)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)  # fit on the KNN split, not the XGBoost split

y_pred_knn = knn.predict(X_test_knn)
accuracy = accuracy_score(y_test_knn, y_pred_knn)
conf_matrix = confusion_matrix(y_test_knn, y_pred_knn)
class_report = classification_report(y_test_knn, y_pred_knn)

print("Training Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# End of optional KNN baseline

from sklearn.model_selection import cross_val_score

# XGBoost 5-fold cross-validation
xgb_cv = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=48)
scores = cross_val_score(xgb_cv, X_train, y_train, cv=5, scoring='accuracy')

print("XGBoost Cross-Validation Accuracy Scores:", scores)
print("Mean Cross-Validation Accuracy:", scores.mean())

# Build the test feature matrix with the same preprocessing fitted on the
# training data. This assumes test.csv already carries the zone-feature
# columns; if not, merge it with the matching zone-feature table first,
# as was done for the training regions.
zone_cols = ['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
test_data[zone_cols] = imputer.transform(test_data[zone_cols])
X_test_scaled = scaler.transform(test_data[features])
y_test_pred = xgb.predict(X_test_scaled)

# Map encoded predictions back to the original destination labels
test_data['Destination'] = le_destination.inverse_transform(y_test_pred)

submission = test_data[['Trip_id', 'Destination']]
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to {submission_file_path}")
