Common Data Types
\\nThis chapter will introduce you to the four most common data types in machine learning: numerical, text, image, and categorical data.
\\nData types are like types of ingredients: different ingredients require different processing methods. Similarly, different types of data require different processing techniques and algorithms.
\\nFour Major Data Type Categories
\\nNumerical Data
\\nWhat is Numerical Data?
\\nNumerical data is like the result of measuring with a ruler: it can be used in mathematical operations, and it is the most common data type in machine learning.
\\nClassification of Numerical Data
\\n1. Continuous Numerical Data
\\nExample
# Continuous numerical data example

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def continuous_data_example():
    """Build, summarize, and plot a synthetic sample of continuous data.

    Seeds NumPy's global random generator with 42, draws 1000 samples per
    column, prints the first rows and the ``describe()`` statistics, and
    shows a 2x2 figure: three histograms plus a height-vs-weight scatter.

    Returns:
        pandas.DataFrame: Columns 'Height (cm)', 'Weight (kg)',
        'Body Temperature (°C)' and 'Age'.
    """
    print("=== Continuous numerical data example ===")

    # Fixed seed so the printed numbers and the plots are reproducible.
    np.random.seed(42)
    n_samples = 1000

    heights = np.random.normal(170, 10, n_samples)  # mean 170, std 10
    # Weight is a linear function of height plus Gaussian noise, which is
    # what produces the correlation visible in the scatter plot.
    weights = heights * 0.7 + np.random.normal(0, 5, n_samples)
    temperatures = np.random.normal(36.5, 0.5, n_samples)  # body temperature

    continuous_data = pd.DataFrame({
        'Height (cm)': heights,
        'Weight (kg)': weights,
        'Body Temperature (°C)': temperatures,
        # Integer ages in [18, 65), drawn last to keep the random stream order.
        'Age': np.random.randint(18, 65, n_samples),
    })

    print("Continuous data example:")
    print(continuous_data.head())

    print("\nData statistics:")
    print(continuous_data.describe())

    # One histogram per measured quantity: (column, title, bar color).
    histograms = [
        ('Height (cm)', 'Height Distribution', 'skyblue'),
        ('Weight (kg)', 'Weight Distribution', 'lightgreen'),
        ('Body Temperature (°C)', 'Body Temperature Distribution', 'salmon'),
    ]

    plt.figure(figsize=(12, 8))
    for position, (column, title, color) in enumerate(histograms, start=1):
        plt.subplot(2, 2, position)
        plt.hist(continuous_data[column], bins=30, alpha=0.7, color=color)
        plt.title(title)
        plt.xlabel(column)
        plt.ylabel('Frequency')

    # The fourth panel shows the relationship between two continuous columns.
    plt.subplot(2, 2, 4)
    plt.scatter(continuous_data['Height (cm)'], continuous_data['Weight (kg)'], alpha=0.6)
    plt.title('Height vs Weight')
    plt.xlabel('Height (cm)')
    plt.ylabel('Weight (kg)')

    plt.tight_layout()
    plt.show()

    return continuous_data


# Run example
continuous_df = continuous_data_example()

# 2. Discrete Numerical Data
\\nExample
# Discrete numerical data example

def discrete_data_example():
    """Build, summarize, and plot a synthetic sample of discrete data.

    Seeds NumPy's global random generator with 42, draws 500 samples per
    column, prints the first rows and the ``describe()`` statistics, and
    shows a 2x2 figure with one distribution plot per column.

    Returns:
        pandas.DataFrame: Integer columns 'Customer Count', 'Product Rating',
        'Defect Count' and 'Call Duration(Second)'.
    """
    print("\n=== Discrete numerical data example ===")

    # Fixed seed so the printed numbers and the plots are reproducible.
    np.random.seed(42)
    n_samples = 500

    customer_count = np.random.poisson(10, n_samples)  # Poisson: customer count
    product_rating = np.random.randint(1, 6, n_samples)  # 1-5 star rating
    defect_count = np.random.binomial(20, 0.1, n_samples)  # Binomial: defect count
    # Exponential sample scaled by 60; truncated to whole seconds below.
    call_duration = np.random.exponential(5, n_samples) * 60

    discrete_data = pd.DataFrame({
        'Customer Count': customer_count,
        'Product Rating': product_rating,
        'Defect Count': defect_count,
        'Call Duration(Second)': call_duration.astype(int),
    })

    print("Discrete data example:")
    print(discrete_data.head())

    print("\nData statistics:")
    print(discrete_data.describe())

    plt.figure(figsize=(12, 8))

    # Unit-width bins so that every integer count gets its own bar.
    plt.subplot(2, 2, 1)
    customer_bins = range(0, int(discrete_data['Customer Count'].max()) + 2)
    plt.hist(discrete_data['Customer Count'], bins=customer_bins, alpha=0.7, color='orange')
    plt.title('Customer Count Distribution')
    plt.xlabel('Customer Count')
    plt.ylabel('Frequency')

    # Ratings take only a handful of values, so plot exact counts as bars.
    plt.subplot(2, 2, 2)
    rating_counts = discrete_data['Product Rating'].value_counts().sort_index()
    plt.bar(rating_counts.index, rating_counts.values, color='purple', alpha=0.7)
    plt.title('Product Rating Distribution')
    plt.xlabel('Rating')
    plt.ylabel('Frequency')

    plt.subplot(2, 2, 3)
    defect_bins = range(0, int(discrete_data['Defect Count'].max()) + 2)
    plt.hist(discrete_data['Defect Count'], bins=defect_bins, alpha=0.7, color='red')
    plt.title('Defect Count Distribution')
    plt.xlabel('Defect Count')
    plt.ylabel('Frequency')

    plt.subplot(2, 2, 4)
    plt.hist(discrete_data['Call Duration(Second)'], bins=30, alpha=0.7, color='brown')
    plt.title('Call Duration Distribution')
    plt.xlabel('Call Duration (Second)')
    plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    return discrete_data


# Run example
discrete_df = discrete_data_example()

# Processing Methods for Numerical Data
\\nExample
# Numerical data processing methods

class NumericDataProcessor:
    """Outlier detection, imputation, scaling and feature helpers.

    Every method operates on the numeric columns of a pandas DataFrame
    (selected with ``select_dtypes(include=[np.number])``) and leaves the
    DataFrame passed in by the caller unmodified.
    """

    def __init__(self):
        # Fitted scikit-learn scalers, keyed by normalization method name.
        self.scalers = {}
        # Not written to by any method of this class.
        self.transformers = {}

    def detect_outliers(self, data, method='iqr'):
        """Report the outliers found in each numeric column.

        Args:
            data: DataFrame to inspect.
            method: 'iqr' flags values outside
                ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``; 'zscore' flags values
                whose absolute z-score exceeds 3.

        Returns:
            dict: Maps each numeric column name to a dict with 'count'
            (number of outliers), 'indices' (their index labels) and
            'percentage' (share of all rows, 0-100).

        Raises:
            ValueError: If ``method`` is neither 'iqr' nor 'zscore'.
        """
        if method not in ('iqr', 'zscore'):
            raise ValueError("Method must be 'iqr' or 'zscore'")

        total_rows = len(data)
        outliers_info = {}

        for column in data.select_dtypes(include=[np.number]).columns:
            values = data[column]

            if method == 'iqr':
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                outliers = values[(values < lower_bound) | (values > upper_bound)]
            else:
                # A constant column has std 0; the resulting NaN z-scores
                # compare False against 3, so nothing is flagged.
                z_scores = np.abs((values - values.mean()) / values.std())
                outliers = values[z_scores > 3]

            outliers_info[column] = {
                'count': len(outliers),
                'indices': outliers.index.tolist(),
                # Guard the division so an empty DataFrame reports 0%.
                'percentage': (len(outliers) / total_rows) * 100 if total_rows else 0.0,
            }

        return outliers_info

    def handle_missing_values(self, data, strategy='mean'):
        """Return a copy of ``data`` with numeric missing values filled.

        Args:
            data: DataFrame to impute.
            strategy: 'mean', 'median' or 'mode' fill with that statistic of
                the column; 'forward' / 'backward' propagate the previous /
                next valid value.

        Returns:
            pandas.DataFrame: Imputed copy. A column that is entirely missing
            has no mode and is left unchanged by the 'mode' strategy.

        Raises:
            ValueError: If ``strategy`` is not one of the supported names.
        """
        if strategy not in ('mean', 'median', 'mode', 'forward', 'backward'):
            raise ValueError(
                "Strategy must be 'mean', 'median', 'mode', 'forward', or 'backward'"
            )

        processed_data = data.copy()

        for column in processed_data.select_dtypes(include=[np.number]).columns:
            values = processed_data[column]
            if not values.isnull().any():
                continue

            if strategy == 'mean':
                filled = values.fillna(values.mean())
            elif strategy == 'median':
                filled = values.fillna(values.median())
            elif strategy == 'mode':
                # mode() returns a Series (ties are possible); use the first.
                modes = values.mode()
                filled = values if modes.empty else values.fillna(modes.iloc[0])
            elif strategy == 'forward':
                filled = values.ffill()
            else:
                filled = values.bfill()

            # Assign the result back instead of using inplace=True on a
            # column selection, which may operate on a temporary copy.
            processed_data[column] = filled

        return processed_data

    def normalize_data(self, data, method='minmax'):
        """Return a copy of ``data`` with its numeric columns rescaled.

        The fitted scaler is stored in ``self.scalers[method]``.

        Args:
            data: DataFrame to scale.
            method: 'minmax' (MinMaxScaler), 'standard' (StandardScaler) or
                'robust' (RobustScaler).

        Returns:
            pandas.DataFrame: Copy of ``data`` whose numeric columns hold the
            scaled values; other columns are unchanged.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method not in ('minmax', 'standard', 'robust'):
            raise ValueError("Method must be 'minmax', 'standard', or 'robust'")

        from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

        scaler_classes = {
            'minmax': MinMaxScaler,
            'standard': StandardScaler,
            'robust': RobustScaler,
        }

        processed_data = data.copy()
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) == 0:
            # Nothing to scale; fitting a scaler on zero columns would fail.
            return processed_data

        scaler = scaler_classes[method]()
        # Scale only the numeric columns so the result stays a DataFrame
        # with the original column names and index.
        processed_data[numeric_columns] = scaler.fit_transform(processed_data[numeric_columns])
        self.scalers[method] = scaler

        return processed_data

    def create_features(self, data):
        """Return a copy of ``data`` with derived numeric feature columns.

        Adds the product and the ratio of the first two numeric columns
        (when at least two exist), then ``<col>_log``, ``<col>_sqrt`` and
        ``<col>_square`` for every numeric column.

        Args:
            data: DataFrame to derive features from.

        Returns:
            pandas.DataFrame: Copy of ``data`` with the extra columns.
        """
        processed_data = data.copy()
        numeric_columns = data.select_dtypes(include=[np.number]).columns

        # Interaction features between the first two numeric columns.
        if len(numeric_columns) >= 2:
            col1, col2 = numeric_columns[0], numeric_columns[1]
            processed_data[f'{col1}_x_{col2}'] = data[col1] * data[col2]
            # The small constant avoids an exact division by zero.
            processed_data[f'{col1}_div_{col2}'] = data[col1] / (data[col2] + 1e-8)

        # Per-column transforms.
        for column in numeric_columns:
            # log1p is not finite for values <= -1.
            processed_data[f'{column}_log'] = np.log1p(data[column])
            processed_data[f'{column}_sqrt'] = np.sqrt(np.abs(data[column]))
            processed_data[f'{column}_square'] = data[column] ** 2

        return processed_data


# Usage example
processor = NumericDataProcessor()

# Detect outliers
outliers = processor.detect_outliers(continuous_df)
print("\nOutlier detection results:")
for column, info in outliers.items():
    if info['count'] > 0:
        print(f"{column}: {info['count']} outliers ({info['percentage']:.2f}%)")

# Data normalization
normalized_data = processor.normalize_data(continuous_df, method='standard')
print("\nNormalized data example:")
print(normalized_data.head())
Text Data
\\nWhat is Text Data?
\\nText data is the written expression of human language. It contains rich semantic information, but it requires special processing before machine learning models can use it.
\\nClassification of Text Data
\\n1. Structured Text Data
\\nExample
# Structured text data example

import re
from collections import Counter

import pandas as pd


def structured_text_example():
    """Print a small table of e-mail metadata and simple text statistics.

    Builds a ten-row DataFrame of e-mail records, prints it, then prints
    the sender-domain distribution, the word frequencies of the subject
    lines, and ``describe()`` statistics of the content lengths.
    """
    print("\n=== Structured text data example ===")

    # Create structured text data
    structured_data = pd.DataFrame({
        'Email ID': range(1, 11),
        'Sender': [
            'zhangsan@example.com', 'lisi@company.com', 'wangwu@service.com',
            'zhaoliu@business.com', 'qianqi@personal.com', 'sunba@tech.com',
            'zhoujiu@edu.com', 'wushi@org.com', 'zhengyi@gov.com',
            'chener@health.com',
        ],
        'Theme': [
            'Meeting Notice: Meeting Tomorrow at 3 PM',
            'Product Quotation: Latest Price List',
            'Customer Feedback: Service Satisfaction Survey',
            'Project Progress: Phase 1 Completed',
            'Holiday Schedule: National Day Holiday Notice',
            'Technical Update: System Upgrade Announcement',
            'Academic Conference: Call for Papers Notice',
            'Training Notice: New Employee Training',
            'Policy Document: Latest Regulations',
            'Health Reminder: Medical Examination Notice',
        ],
        'Content Length': [156, 234, 189, 145, 98, 267, 198, 134, 312, 87],
    })

    print("Structured text data example:")
    print(structured_data)

    # Text feature extraction
    print("\n=== Text feature analysis ===")

    # Email domain analysis: keep only the part after the last '@'. Counter
    # needs hashable items, so count the domain string, not the split list.
    domains = [email.rsplit('@', 1)[-1] for email in structured_data['Sender']]
    domain_counts = Counter(domains)
    print(f"Email domain distribution: {dict(domain_counts)}")

    # Subject keyword analysis: \w+ matches runs of word characters, so it
    # tokenizes the English subjects here and also non-Latin scripts.
    all_words = []
    for subject in structured_data['Theme']:
        all_words.extend(re.findall(r'\w+', subject))
    word_counts = Counter(all_words)
    print(f"Subject word frequency: {dict(word_counts)}")

    # Content length statistics
    print("Content length statistics:")
    print(structured_data['Content Length'].describe())
YouTip