Data Science Fundamentals

Core Components

  • Data Collection and Cleaning
  • Exploratory Data Analysis (EDA) - see the short sketch after this list
  • Statistical Analysis
  • Machine Learning
  • Data Visualization
  • Communication of Results
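
A minimal EDA sketch follows, assuming only a generic pandas DataFrame df; the function name explore_dataset is a placeholder used here for illustration.

# Python - Exploratory Data Analysis (EDA) sketch
import pandas as pd

def explore_dataset(df):
    # Shape, column dtypes, and non-null counts
    print(df.shape)
    df.info()

    # Summary statistics for numeric and categorical columns
    print(df.describe(include='all'))

    # Missing values per column
    print(df.isna().sum())

    # Most frequent values in each string/categorical column
    for col in df.select_dtypes(include='object').columns:
        print(df[col].value_counts().head())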

Data Preparation

Data Cleaning with Pandas

# Python - Data Cleaning Example
import pandas as pd

def clean_dataset(df):
    # Remove duplicate rows
    df = df.drop_duplicates()

    # Handle missing values: mean for numeric, mode for categorical
    df['numeric_col'] = df['numeric_col'].fillna(df['numeric_col'].mean())
    df['categorical_col'] = df['categorical_col'].fillna(df['categorical_col'].mode()[0])

    # Remove outliers using the IQR method
    Q1 = df['numeric_col'].quantile(0.25)
    Q3 = df['numeric_col'].quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df['numeric_col'] < (Q1 - 1.5 * IQR)) |
              (df['numeric_col'] > (Q3 + 1.5 * IQR)))]

    # Standardize the numeric column (z-score)
    df['normalized_col'] = (df['numeric_col'] - df['numeric_col'].mean()) / \
                           df['numeric_col'].std()

    return df

Feature Engineering

# Feature Engineering Example
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def engineer_features(df):
    # Create date-based features
    df['date_col'] = pd.to_datetime(df['date_col'])
    df['year'] = df['date_col'].dt.year
    df['month'] = df['date_col'].dt.month
    df['day_of_week'] = df['date_col'].dt.dayofweek

    # Encode categorical variables as integer labels
    le = LabelEncoder()
    df['encoded_category'] = le.fit_transform(df['category_col'])

    # Create an interaction feature
    df['feature_interaction'] = df['feature1'] * df['feature2']

    # Bin a numerical feature into quintiles
    df['binned_feature'] = pd.qcut(df['numeric_col'], q=5,
                                   labels=['A', 'B', 'C', 'D', 'E'])

    return df
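
As a quick, hypothetical usage example, the snippet below builds a small DataFrame whose column names simply mirror the placeholders assumed by engineer_features (date_col, category_col, feature1, feature2, numeric_col) and prints the engineered columns; the values themselves are arbitrary.

# Python - hypothetical usage of engineer_features
toy = pd.DataFrame({
    'date_col': pd.date_range('2024-01-01', periods=10, freq='D'),
    'category_col': ['red', 'blue'] * 5,
    'feature1': range(10),
    'feature2': range(10, 20),
    'numeric_col': [3, 7, 1, 9, 4, 8, 2, 6, 5, 10]
})
features = engineer_features(toy)
print(features[['year', 'month', 'day_of_week',
                'encoded_category', 'binned_feature']])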

Statistical Analysis

Descriptive and Inferential Statistics

# Python - Statistical Analysis
import pandas as pd
import scipy.stats as stats

def analyze_dataset(df):
    # Basic descriptive statistics
    basic_stats = df.describe()

    # Correlation analysis (numeric columns only)
    correlation_matrix = df.corr(numeric_only=True)

    # Hypothesis testing: two-sample t-test
    group1 = df[df['category'] == 'A']['value']
    group2 = df[df['category'] == 'B']['value']
    t_stat, t_p_value = stats.ttest_ind(group1, group2)

    # Chi-square test of independence
    contingency_table = pd.crosstab(df['category1'], df['category2'])
    chi2, chi2_p_value, dof, expected = stats.chi2_contingency(contingency_table)

    return {
        'basic_stats': basic_stats,
        'correlation': correlation_matrix,
        't_test': {'statistic': t_stat, 'p_value': t_p_value},
        'chi2_test': {'statistic': chi2, 'p_value': chi2_p_value}
    }

Data Visualization

Matplotlib and Seaborn

# Python - Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

def create_visualizations(df):
    # Set theme and palette
    sns.set_theme(style='whitegrid', palette='husl')

    # Create a 2x2 grid of subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Distribution plot
    sns.histplot(data=df, x='numeric_col', kde=True, ax=axes[0, 0])
    axes[0, 0].set_title('Distribution Plot')

    # Box plot
    sns.boxplot(data=df, x='category', y='value', ax=axes[0, 1])
    axes[0, 1].set_title('Box Plot by Category')

    # Scatter plot with regression line
    sns.regplot(data=df, x='x_var', y='y_var', ax=axes[1, 0])
    axes[1, 0].set_title('Scatter Plot with Regression')

    # Correlation heatmap
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=axes[1, 1])
    axes[1, 1].set_title('Correlation Heatmap')

    plt.tight_layout()
    return fig

Machine Learning Basics

Supervised Learning Example

# Python - Basic ML Pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def ml_pipeline(X, y):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scale features (fit on training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Evaluate the model
    report = classification_report(y_test, y_pred)
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    return {
        'model': model,
        'report': report,
        'feature_importance': feature_importance
    }

Cross-Validation and Hyperparameter Tuning

from sklearn.model_selection import cross_val_score, GridSearchCV

def optimize_model(X, y, model, param_grid):
    # 5-fold cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    # Exhaustive grid search over the supplied parameter grid
    grid_search = GridSearchCV(model, param_grid, cv=5,
                               scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)

    return {
        'cv_scores': cv_scores,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }
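
As an illustration, the grid below tunes the random forest from the previous section; it assumes the same feature matrix X and labels y passed to ml_pipeline, and the parameter values are placeholder choices rather than a recommended search space.

# Python - hypothetical usage of optimize_model
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10]
}
results = optimize_model(X, y, RandomForestClassifier(random_state=42), param_grid)
print(results['cv_scores'].mean())
print(results['best_params'], results['best_score'])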

Essential Tools

Core Libraries

  • NumPy - Numerical Computing (see the brief sketch after this list)
  • Pandas - Data Manipulation
  • Scikit-learn - Machine Learning
  • Matplotlib/Seaborn - Visualization
  • SciPy - Scientific Computing
  • TensorFlow/PyTorch - Deep Learning
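
For a sense of what numerical computing with NumPy looks like in practice, here is a tiny sketch; the array values are arbitrary.

# Python - NumPy in brief
import numpy as np

values = np.array([3.0, 7.0, 1.0, 9.0, 4.0])
print(values.mean(), values.std())   # vectorized statistics
print(np.log(values))                # element-wise functions
print(values @ values)               # linear algebra (dot product)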

Development Environment

  • Jupyter Notebooks
  • Anaconda Distribution
  • Git for Version Control
  • Virtual Environments
  • Cloud Platforms (AWS, GCP, Azure)