import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.tree import DecisionTreeClassifier      # decision tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Pandas display setting: show all columns
pd.set_option('display.max_columns', None)

# Load the data
data = pd.read_csv('creditcard.csv')

# Look at the first 5 rows
print(data.head(5))

# Inspect the shape and column info
print(data.shape)
data.info()

# Summary statistics
print(data.describe())

# Check for missing values
print(data.isnull().sum())

# Count the samples in each class
print(data['Class'].value_counts())

# Split features and label
X = data.drop('Class', axis=1)
y = data['Class']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Training on the original (imbalanced) data -----")

# Train and evaluate a logistic regression model
print("\nLogistic Regression:")
# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

print("\nDecisionTreeClassifier:")
# Initialize the decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

from imblearn.under_sampling import RandomUnderSampler

print("Undersampling to balance the classes")
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

# Split the resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate a logistic regression model
print("\nLogistic Regression:")
# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

print("\nDecisionTreeClassifier:")
# Initialize the decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

from imblearn.over_sampling import SMOTE

print("Oversampling with SMOTE to balance the classes")
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split the resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model tuning: re-evaluate both models on the SMOTE-balanced data
# (an explicit hyperparameter-search sketch follows below)
print("\nDecisionTreeClassifier:")
# Initialize the decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

# Train and evaluate a logistic regression model
print("\nLogistic Regression:")
# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Print performance metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')
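# ------------------------------------------------------------------
# NOTE: the "model tuning" step above only retrains the two models with
# their default settings on the SMOTE-balanced data. Below is a minimal,
# illustrative sketch of an actual hyperparameter search for the decision
# tree using GridSearchCV; the parameter grid and scoring choice are
# assumptions for demonstration, not part of the original script.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [5, 10, 20, None],       # assumed candidate depths
    'min_samples_split': [2, 10, 50],     # assumed candidate split thresholds
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='f1',   # optimize F1, which balances precision and recall for fraud detection
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated F1:", grid_search.best_score_)
# ------------------------------------------------------------------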
Regression:") # 初始化逻辑回归模型 model = LogisticRegression(max_iter=1000, random_state=42) model.fit(X_train, y_train) y_pred = model.predict(X_test) # 打印性能指标 print(f'Accuracy: {accuracy_score(y_test, y_pred)}') print(f'Precision: {precision_score(y_test, y_pred)}') print(f'Recall: {recall_score(y_test, y_pred)}') print(f'F1 Score: {f1_score(y_test, y_pred)}') from sklearn.model_selection import learning_curve # 获取学习曲线数据 train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)) # 计算训练和测试分数的平均值与标准差 train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) # 绘制学习曲线 plt.figure() plt.title("Learning Curve") plt.xlabel("Training Examples") plt.ylabel("Score") plt.grid() plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score") plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation Score") plt.legend(loc="best") plt.show() # 加载逻辑回归模型 logistic_model = joblib.load('logistic_regression_model.pkl') # 加载决策树模型 decision_tree_model = joblib.load('decision_tree_model.pkl')