Python 数据科学高级应用指南

张开发
2026/6/27 9:42:45 15 分钟阅读
Python 数据科学高级应用指南
Python 数据科学高级应用指南

1. 数据科学基础

数据科学是一门融合了统计学、计算机科学和领域知识的学科，它使用各种技术和方法来从数据中提取有价值的信息。Python 是数据科学领域最流行的编程语言之一，拥有丰富的库和工具。

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 创建示例数据
data = pd.DataFrame({
    "age": [25, 30, 35, 40, 45],
    "income": [50000, 60000, 70000, 80000, 90000]
})

# 数据可视化
plt.scatter(data["age"], data["income"])
plt.title("Age vs Income")
plt.xlabel("Age")
plt.ylabel("Income")
plt.show()
```

2. 数据预处理

2.1 数据清洗

```python
import pandas as pd
import numpy as np

# 创建包含缺失值的数据
data = pd.DataFrame({
    "age": [25, np.nan, 35, 40, 45],
    "income": [50000, 60000, np.nan, 80000, 90000],
    "gender": ["M", "F", "M", "F", "M"]
})

# 查看缺失值
print(data.isnull())

# 填充缺失值
data["age"].fillna(data["age"].mean(), inplace=True)
data["income"].fillna(data["income"].median(), inplace=True)

# 查看处理后的数据
print(data)
```

2.2 特征工程

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 创建数据
data = pd.DataFrame({
    "age": [25, 30, 35, 40, 45],
    "income": [50000, 60000, 70000, 80000, 90000],
    "gender": ["M", "F", "M", "F", "M"]
})

# 标准化数值特征
scaler = StandardScaler()
data[["age", "income"]] = scaler.fit_transform(data[["age", "income"]])

# 编码分类特征
encoder = OneHotEncoder(drop="first", sparse=False)
gender_encoded = encoder.fit_transform(data[["gender"]])
gender_df = pd.DataFrame(gender_encoded, columns=["gender_M"])
data = pd.concat([data, gender_df], axis=1)
data.drop("gender", axis=1, inplace=True)

# 查看处理后的数据
print(data)
```

3. 
机器学习

3.1 监督学习

```python
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# 创建数据
data = pd.DataFrame({
    "age": [25, 30, 35, 40, 45, 50, 55, 60],
    "income": [50000, 60000, 70000, 80000, 90000, 95000, 98000, 100000]
})

# 准备特征和标签
X = data[["age"]]
y = data["income"]

# 拆分数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练模型
model = LinearRegression()
model.fit(X_train, y_train)

# 预测
y_pred = model.predict(X_test)

# 评估
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# 查看模型系数
print(f"Coefficient: {model.coef_[0]}")
print(f"Intercept: {model.intercept_}")
```

3.2 无监督学习

```python
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 创建数据
np.random.seed(42)
data = pd.DataFrame({
    "x": np.random.normal(0, 1, 100),
    "y": np.random.normal(0, 1, 100)
})

# 训练 K-Means 模型
kmeans = KMeans(n_clusters=3, random_state=42)
data["cluster"] = kmeans.fit_predict(data[["x", "y"]])

# 可视化聚类结果
plt.scatter(data["x"], data["y"], c=data["cluster"], cmap="viridis")
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c="red")
plt.title("K-Means Clustering")
plt.show()
```

4. 
深度学习

4.1 神经网络基础

```python
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

# 创建数据
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# 创建模型
model = Sequential([
    Dense(4, activation="relu", input_shape=(2,)),
    Dense(1, activation="sigmoid")
])

# 编译模型
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# 训练模型
model.fit(X, y, epochs=1000, verbose=0)

# 预测
predictions = model.predict(X)
print(predictions)
```

4.2 卷积神经网络

```python
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten

# 加载数据
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# 数据预处理
x_train = x_train.reshape(-1, 28, 28, 1) / 255.0
x_test = x_test.reshape(-1, 28, 28, 1) / 255.0
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

# 创建模型
model = Sequential([
    Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation="relu"),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation="relu"),
    Dense(10, activation="softmax")
])

# 编译模型
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# 训练模型
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# 评估模型
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {accuracy}")
```

5. 
实际应用场景

5.1 预测分析

```python
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 加载数据
data = pd.read_csv("house_prices.csv")

# 准备特征和标签
X = data[["sqft_living", "bedrooms", "bathrooms"]]
y = data["price"]

# 拆分数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练模型
model = LinearRegression()
model.fit(X_train, y_train)

# 预测
y_pred = model.predict(X_test)

# 可视化预测结果
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "k--", lw=2)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted House Prices")
plt.show()
```

5.2 自然语言处理

```python
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# 示例文本
texts = [
    "Python is a popular programming language",
    "Java is used for enterprise applications",
    "C is a compiled language",
    "Python is used for data science",
    "Java is used for Android development",
    "C is used for system programming"
]

# 文本预处理
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

processed_texts = []
for text in texts:
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    processed_texts.append(" ".join(filtered_tokens))

# 特征提取
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_texts)

# 聚类
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)

# 查看结果
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Cluster: {clusters[i]}")
    print()
```

6. 最佳实践

- 数据质量：确保数据质量，处理缺失值和异常值。
- 特征工程：合理选择和处理特征，提高模型性能。
- 模型选择：根据问题类型选择合适的模型。
- 模型评估：使用合适的评估指标评估模型性能。
- 超参数调优：通过交叉验证等方法调优模型超参数。
- 可解释性：关注模型的可解释性，理解模型决策过程。

7. 总结

Python 是数据科学领域的强大工具，它提供了丰富的库和工具，从数据处理到机器学习和深度学习。通过掌握这些工具的高级应用，我们可以从数据中提取有价值的信息，解决复杂的问题。在实际应用中，我们可以使用 Python 进行预测分析、自然语言处理、图像处理等多种任务，为业务决策提供支持。希望本文对你理解和应用 Python 数据科学有所帮助。

更多文章