ML Python Libraries
This chapter will introduce you in detail to the four core Python libraries in machine learning: NumPy, Pandas, Matplotlib, and Scikit-learn.
**Machine learning libraries are like a professional toolbox**: each library has a specific purpose, and used together they can complete complex machine learning tasks.
!(#)
### Roles of the Four Core Libraries
* **NumPy**: The foundation of numerical computing, providing efficient array operations
* **Pandas**: A powerful tool for data processing, providing data structures and analysis tools
* **Matplotlib**: The brush for data visualization, creating various charts
* **Scikit-learn**: The Swiss Army knife of machine learning, providing a complete ML toolchain
* * *
## NumPy: The Foundation of Numerical Computing
### What is NumPy?
**NumPy is like a calculator for mathematical computations**, but infinitely more powerful. It is the foundational library for scientific computing in Python, providing efficient multi-dimensional array objects.
### Core Concepts of NumPy
#### 1. Array
## Example
# NumPy Array Basic Operations
import numpy as np

# Different ways to create arrays
print("=== NumPy Array Creation ===")

# Create from a Python list
arr1 = np.array([1, 2, 3, 4, 5])
print(f"Created from list: {arr1}")

# Arithmetic sequence: start 0, stop 10 (exclusive), step 2
arr2 = np.arange(0, 10, 2)
print(f"Arithmetic array: {arr2}")

# Evenly spaced array: 5 points from 0 to 1, both endpoints included
arr3 = np.linspace(0, 1, 5)
print(f"Evenly spaced array: {arr3}")

# Special arrays
zeros_arr = np.zeros((2, 3))  # 2x3 array of zeros
ones_arr = np.ones((2, 3))  # 2x3 array of ones
identity_arr = np.eye(3)  # 3x3 identity matrix

# The "\n" starts each 2-D array on its own line, below its label
print(f"Zero array:\n{zeros_arr}")
print(f"Ones array:\n{ones_arr}")
print(f"Identity matrix:\n{identity_arr}")
#### 2. Array Operations
## Example
# Basic Array Operations
print("\n=== Array Basic Operations ===")

# Array attributes
arr = np.array([[1, 2, 3], [4, 5, 6]])
print(f"Array:\n{arr}")
print(f"Shape: {arr.shape}")
print(f"Dimensions: {arr.ndim}")
print(f"Number of elements: {arr.size}")
print(f"Data type: {arr.dtype}")

# Array indexing and slicing
print(f"First row: {arr[0]}")  # index 0 on the first axis selects row 0
print(f"First column: {arr[:, 0]}")  # all rows, column 0
print(f"Element [1,2]: {arr[1, 2]}")  # row 1, column 2

# Element-wise arithmetic and the dot product
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
print(f"Addition: {arr1 + arr2}")
print(f"Multiplication: {arr1 * arr2}")
print(f"Dot product: {np.dot(arr1, arr2)}")

# Statistical functions
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Standard deviation: {np.std(data)}")
print(f"Maximum: {np.max(data)}")
print(f"Minimum: {np.min(data)}")
print(f"Median: {np.median(data)}")
#### NumPy Practical Application Example
## Example
# NumPy Practical Application: Simple Linear Regression
def numpy_linear_regression():
    """Fit a simple linear regression with the normal equation using NumPy.

    Generates 100 synthetic samples from ``y = 4 + 3 * x + noise`` with a
    fixed random seed, solves for the parameters in closed form, and prints
    the learned parameters together with the predictions at x = 0 and x = 2.

    Returns:
        tuple: ``(theta_best, X, y)`` where ``theta_best`` is a (2, 1) array
        holding ``[[intercept], [slope]]``, ``X`` is the (100, 1) feature
        array and ``y`` is the (100, 1) label array.
    """
    # Generate sample data (seeded so the example is reproducible)
    np.random.seed(42)
    X = 2 * np.random.rand(100, 1)  # Features
    y = 4 + 3 * X + np.random.randn(100, 1)  # Labels + noise

    # Add x0 = 1 to every sample so the intercept is learned as a weight
    X_b = np.c_[np.ones((100, 1)), X]

    # Solve using the normal equation: theta = (X^T X)^(-1) X^T y
    theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

    # theta_best is a (2, 1) column vector; pull out scalars so that the
    # ".2f" format spec applies to numbers rather than to an ndarray
    intercept = theta_best[0, 0]
    slope = theta_best[1, 0]
    print("=== NumPy Linear Regression Example ===")
    print(f"Learned parameters: intercept={intercept:.2f}, slope={slope:.2f}")

    # Prediction at the two ends of the feature range
    X_new = np.array([[0], [2]])
    X_new_b = np.c_[np.ones((2, 1)), X_new]
    y_predict = X_new_b.dot(theta_best)
    print(
        f"Predictions: when X=0, y={y_predict[0, 0]:.2f}; "
        f"when X=2, y={y_predict[1, 0]:.2f}"
    )
    return theta_best, X, y


# Run example
theta, X, y = numpy_linear_regression()
* * *
## Pandas: A Powerful Tool for Data Processing
### What is Pandas?
**Pandas is like a Swiss Army knife for data processing**, providing powerful data structures and data analysis tools, especially suitable for handling tabular data.
### Core Data Structures of Pandas
#### 1. Series (One-dimensional Data)
## Example
# Pandas Series Basic Operations
import pandas as pd

print("=== Pandas Series ===")

# Create Series from list (gets a default 0..n-1 integer index)
s1 = pd.Series([1, 2, 3, 4, 5])
print(f"Created from list:\n{s1}")

# Series with an explicit label index
s2 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(f"\nSeries with index:\n{s2}")

# Create Series from dictionary (keys become the index)
s3 = pd.Series({'Math': 90, 'English': 85, 'Physics': 88})
print(f"\nCreated from dictionary:\n{s3}")

# Series operations
print(f"\nAccess element: s2['b'] = {s2['b']}")
# An integer slice on a label-indexed Series selects by position
print(f"Slicing: s2[0:2] =\n{s2[0:2]}")
print(f"Statistics:\n{s2.describe()}")
#### 2. DataFrame (Two-dimensional Data)
## Example
# Pandas DataFrame Basic Operations
print("\n=== Pandas DataFrame ===")

# Create DataFrame from a dict of equal-length columns
data = {
    'Name': ['Zhang San', 'Li Si', 'Wang Wu', 'Zhao Liu'],
    'Age': [25, 30, 35, 28],
    'City': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen'],
    'Salary': [15000, 20000, 18000, 22000],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# DataFrame basic operations
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumn names: {list(df.columns)}")
print(f"\nData types:\n{df.dtypes}")

# Select data
print(f"\nSelect 'Name' column:\n{df['Name']}")
print(f"\nSelect first two rows:\n{df.head(2)}")
# Boolean mask: keep only the rows where the condition holds
print(f"\nSelect rows where age > 28:\n{df[df['Age'] > 28]}")

# Statistics
print(f"\nStatistics for numeric columns:\n{df.describe()}")

# Add new column (vectorized over the whole 'Salary' column)
df['Annual Salary'] = df['Salary'] * 12
print(f"\nAfter adding annual salary column:\n{df}")
#### Pandas Data Processing Example
## Example
# Pandas Data Processing Complete Example
def pandas_data_processing():
    """Demonstrate a complete Pandas data processing workflow.

    Builds a synthetic table of 1000 students (seeded RNG), clips the three
    score columns to the 0-100 range, derives total/average score and a
    letter grade, then prints several summaries (per-subject means,
    per-class means, gender and grade distributions, top students).

    Returns:
        pandas.DataFrame: the processed table, including the derived
        'TotalScore', 'AverageScore' and 'Grade' columns.
    """
    print("=== Pandas Data Processing Example ===")

    # 1. Create sample data (seeded so the example is reproducible)
    np.random.seed(42)
    n_samples = 1000
    data = {
        'StudentID': range(1, n_samples + 1),
        'Name': [f'Student{i}' for i in range(1, n_samples + 1)],
        'Age': np.random.randint(18, 25, n_samples),
        'Gender': np.random.choice(['Male', 'Female'], n_samples),
        'MathScore': np.random.normal(75, 15, n_samples),
        'EnglishScore': np.random.normal(80, 12, n_samples),
        'PhysicsScore': np.random.normal(72, 18, n_samples),
        'Class': np.random.choice(['Class 1', 'Class 2', 'Class 3'], n_samples),
    }
    df = pd.DataFrame(data)

    # 2. Data cleaning
    print("Original data shape:", df.shape)
    # Handle outliers (scores should be between 0-100). Only the score
    # columns are clipped: the frame also holds string columns, which
    # cannot be compared against numeric bounds.
    score_columns = ['MathScore', 'EnglishScore', 'PhysicsScore']
    df[score_columns] = df[score_columns].clip(0, 100)

    # 3. Feature engineering
    # Calculate total and average scores over the score columns only
    df['TotalScore'] = df[score_columns].sum(axis=1)
    df['AverageScore'] = df[score_columns].mean(axis=1)

    # Add grade
    def get_grade(score):
        """Map a numeric average score to a letter grade A-F."""
        if score >= 90:
            return 'A'
        elif score >= 80:
            return 'B'
        elif score >= 70:
            return 'C'
        elif score >= 60:
            return 'D'
        else:
            return 'F'

    df['Grade'] = df['AverageScore'].apply(get_grade)

    # 4. Data analysis
    print("\n=== Data Analysis Results ===")
    # Basic statistics
    print("Average scores by subject:")
    print(df[score_columns].mean())

    # Analysis by class
    print("\nAverage scores by class:")
    class_avg = df.groupby('Class')['AverageScore'].mean()
    print(class_avg)

    # Analysis by gender
    print("\nGender distribution:")
    gender_count = df['Gender'].value_counts()
    print(gender_count)

    # Grade distribution
    print("\nGrade distribution:")
    grade_dist = df['Grade'].value_counts().sort_index()
    print(grade_dist)

    # 5. Data filtering
    print("\n=== Specific Data Filtering ===")
    # Excellent students (average score > 85)
    excellent_students = df[df['AverageScore'] > 85].head(5)
    print("Excellent students (top 5):")
    print(excellent_students[['Name', 'AverageScore', 'Grade']])

    # Highest scoring student in each class: idxmax gives, per class, the
    # row label of the best average, which .loc then looks up
    print("\nHighest scoring student in each class:")
    top_students = df.loc[df.groupby('Class')['AverageScore'].idxmax()]
    print(top_students[['Class', 'Name', 'AverageScore']])

    return df


# Run example
student_df = pandas_data_processing()
* * *
## Matplotlib: The Brush for Data Visualization
### What is Matplotlib?
**Matplotlib is like a data artist's brush**, capable of transforming dull data into intuitive charts, helping us understand patterns and relationships in data.
### Matplotlib Basic Charts
## Example
# Matplotlib Basic Chart Examples
import matplotlib.pyplot as plt
import numpy as np
# Set a font that contains Chinese glyphs (prevents Chinese text from rendering as empty squares)
plt.rcParams['font.sans-serif']=['SimHei','Arial Unicode MS']
plt.rcParams['axes.unicode_minus']=False
def matplotlib_basic_charts():
    """Demonstrate Matplotlib basic charts.

    Draws six chart types (line, scatter, bar, histogram, pie, box plot)
    into a single 2x3 grid of subplots, shows the figure, and prints a
    confirmation message. Returns nothing.
    """
    print("=== Matplotlib Basic Chart Examples ===")

    # One figure holds all six subplots
    plt.figure(figsize=(12, 8))

    # 1. Line chart
    plt.subplot(2, 3, 1)
    x = np.linspace(0, 10, 100)
    y1 = np.sin(x)
    y2 = np.cos(x)
    plt.plot(x, y1, label='sin(x)')
    plt.plot(x, y2, label='cos(x)')
    plt.title('Trigonometric Functions')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True)

    # 2. Scatter plot (seeded so the random samples are reproducible)
    plt.subplot(2, 3, 2)
    np.random.seed(42)
    x = np.random.randn(100)
    y = 2 * x + np.random.randn(100) * 0.5
    plt.scatter(x, y, alpha=0.6, c='blue')
    plt.title('Scatter Plot')
    plt.xlabel('X')
    plt.ylabel('Y')

    # 3. Bar chart
    plt.subplot(2, 3, 3)
    categories = ['A', 'B', 'C', 'D', 'E']
    values = [23, 45, 56, 78, 32]
    plt.bar(categories, values, color=['red', 'green', 'blue', 'orange', 'purple'])
    plt.title('Bar Chart')
    plt.xlabel('Category')
    plt.ylabel('Value')

    # 4. Histogram
    plt.subplot(2, 3, 4)
    data = np.random.normal(100, 15, 1000)
    plt.hist(data, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    plt.title('Histogram')
    plt.xlabel('Value')
    plt.ylabel('Frequency')

    # 5. Pie chart
    plt.subplot(2, 3, 5)
    sizes = [30, 25, 20, 15, 10]
    labels = ['A', 'B', 'C', 'D', 'E']
    colors = ['gold', 'lightcoral', 'lightskyblue', 'lightgreen', 'plum']
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    plt.title('Pie Chart')

    # 6. Box plot
    plt.subplot(2, 3, 6)
    data1 = np.random.normal(0, 1, 100)
    data2 = np.random.normal(2, 1, 100)
    data3 = np.random.normal(-2, 1, 100)
    plt.boxplot([data1, data2, data3])
    # Boxes are drawn at x positions 1..N by default. Labelling them through
    # xticks avoids boxplot's `labels` keyword, which newer Matplotlib
    # releases deprecate in favour of `tick_labels`.
    plt.xticks([1, 2, 3], ['Group 1', 'Group 2', 'Group 3'])
    plt.title('Box Plot')
    plt.ylabel('Value')

    plt.tight_layout()
    plt.show()
    print("Chart displayed!")
# Run example
matplotlib_basic_charts()
#### Advanced Visualization Example
## Example
# Advanced Visualization Example
def advanced_visualization():
"""Demonstrate advanced visualization techniques"""
print("=== Advanced Visualization Example ===")
# Create more complex data
np.random.seed(42)
n_points =200
# Generate correlated data
x = np.random.randn(n_points)
y =2 *
YouTip