{ "cells": [ { "cell_type": "markdown", "id": "1cb3d1e8-f930-4777-816f-bc5379118487", "metadata": {}, "source": [ "# Data Analysis Fundamentals: 1D Data Exploration" ] }, { "cell_type": "markdown", "id": "bed06ecb-5cc0-4978-bf46-a08361b8f4c1", "metadata": {}, "source": [ "This notebook introduces fundamental concepts in analyzing one-dimensional datasets. We'll explore:\n", "- Basic NumPy operations\n", "- Visualizing distributions with histograms and KDEs\n", "- Statistical summaries and outlier detection\n", "- Distribution transformations" ] }, { "attachments": {}, "cell_type": "markdown", "id": "00a76bba-6439-44df-9bc8-b82f04bc5548", "metadata": {}, "source": [ "## 1. Setup and Installation\n", "\n", "If you haven't installed the required packages, run:" ] }, { "cell_type": "code", "execution_count": null, "id": "24ce4246-3918-4b93-888d-45152480022f", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# %pip (rather than !pip) installs into the environment of the running kernel\n", "%pip install numpy scipy matplotlib seaborn" ] }, { "cell_type": "code", "execution_count": null, "id": "bdc7ffa5-1171-489d-a0ea-529e1c9a32b2", "metadata": {}, "outputs": [], "source": [ "# Import necessary libraries\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy import stats\n", "\n", "# Set random seed for reproducibility\n", "np.random.seed(42)" ] }, { "cell_type": "markdown", "id": "5f338d7a-7bf7-451e-ae35-c12974f88479", "metadata": {}, "source": [ "## 2. NumPy Arrays Basics\n", "\n", "NumPy arrays are the foundation of numerical computing in Python. Let's explore different ways to create and manipulate them."
] }, { "cell_type": "code", "execution_count": null, "id": "16636eb3-9da0-451f-931e-d4222bc86bf1", "metadata": {}, "outputs": [], "source": [ "# Different ways to create arrays\n", "arr_from_list = np.array([1, 2, 3, 4, 5])\n", "arr_range = np.arange(0, 10, 2) # start, stop, step\n", "arr_linspace = np.linspace(0, 1, 11) # start, stop, num_points\n", "arr_random = np.random.randn(100) # 100 samples from standard normal\n", "\n", "print(\"From list:\", arr_from_list)\n", "print(\"Using arange:\", arr_range)\n", "print(\"Using linspace:\", arr_linspace[:5], \"...\") # showing first 5\n", "print(\"Random array shape:\", arr_random.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "af6fff06-af9d-4501-ba6b-2e064e159af0", "metadata": {}, "outputs": [], "source": [ "print(arr_random)" ] }, { "cell_type": "code", "execution_count": null, "id": "65b2b832-973f-4387-93ae-d07eba76910b", "metadata": {}, "outputs": [], "source": [ "# Vectorization demonstration\n", "import time\n", "\n", "# Create a large array\n", "large_array = np.random.randn(1_000_000)\n", "\n", "# Timing uses time.perf_counter(): a monotonic, high-resolution clock.\n", "# time.time() can be too coarse to resolve the fast vectorized path, which\n", "# would make vec_time 0.0 and the speedup division fail.\n", "\n", "# Method 1: Using a loop (slow)\n", "start = time.perf_counter()\n", "squared_loop = []\n", "for x in large_array:\n", " squared_loop.append(x**2)\n", "loop_time = time.perf_counter() - start\n", "\n", "# Method 2: Vectorized operation (fast)\n", "start = time.perf_counter()\n", "squared_vec = large_array**2\n", "vec_time = time.perf_counter() - start\n", "\n", "print(f\"Loop time: {loop_time:.4f} seconds\")\n", "print(f\"Vectorized time: {vec_time:.4f} seconds\")\n", "print(f\"Speedup: {loop_time/vec_time:.1f}x\")" ] }, { "cell_type": "code", "execution_count": null, "id": "9191603d-e639-4dab-983b-ad8493bd3e5a", "metadata": {}, "outputs": [], "source": [ "# Basic statistical methods\n", "sample_data = np.random.normal(50, 10, 200)\n", "\n", "print(\"Basic statistics:\")\n", "print(f\"Mean: {sample_data.mean():.2f}\")\n", "print(f\"Standard deviation: {sample_data.std():.2f}\")\n", "print(f\"Minimum: {sample_data.min():.2f}\")\n", 
"print(f\"Maximum: {sample_data.max():.2f}\")\n", "print(f\"Median: {np.median(sample_data):.2f}\")" ] }, { "cell_type": "markdown", "id": "27607860-7f51-42bb-b57b-c89a7436266d", "metadata": {}, "source": [ "## 3. First 1D Data Analysis\n", "\n", "Let's analyze a dataset step by step, building our intuition about distributions." ] }, { "cell_type": "code", "execution_count": null, "id": "9f04f472-8cf4-4c89-a76a-b6731c86db2f", "metadata": {}, "outputs": [], "source": [ "# Generate sample data - imagine these are test scores\n", "np.random.seed(42)\n", "test_scores = np.random.normal(loc=75, scale=12, size=1000)\n", "\n", "# Ensure scores are in valid range [0, 100]\n", "test_scores = np.clip(test_scores, 0, 100)\n", "\n", "print(f\"Generated {len(test_scores)} test scores\")\n", "print(f\"First 10 scores: {test_scores[:10].round(1)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "c8d51042-8d62-499b-8e5c-9ca7102ed4c3", "metadata": {}, "outputs": [], "source": [ "# Visualize with histogram and KDE\n", "fig, ax = plt.subplots(figsize=(10, 6))\n", "\n", "# Histogram\n", "n, bins, patches = ax.hist(test_scores, bins=30, density=True, alpha=0.7, \n", " color='skyblue', edgecolor='black', label='Histogram')\n", "\n", "# KDE (Kernel Density Estimation)\n", "kde_xs = np.linspace(test_scores.min(), test_scores.max(), 200)\n", "kde = stats.gaussian_kde(test_scores)\n", "ax.plot(kde_xs, kde(kde_xs), 'r-', linewidth=2, label='KDE')\n", "\n", "ax.set_xlabel('Test Score')\n", "ax.set_ylabel('Density')\n", "ax.set_title('Distribution of Test Scores')\n", "ax.legend()\n", "ax.grid(True, alpha=0.3)\n", "\n", "plt.show()\n", "\n", "# Discussion about bin selection\n", "print(f\"Using {len(bins)-1} bins\")\n", "print(\"Try changing 'bins' parameter to see how it affects the visualization!\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7bb1ac0f-a68a-49a8-b2b6-273df6bede67", "metadata": {}, "outputs": [], "source": [ "# Calculate quantiles and 
percentiles\n", "percentiles = [10, 25, 50, 75, 90]\n", "quantile_values = np.percentile(test_scores, percentiles)\n", "\n", "print(\"Percentiles:\")\n", "for p, v in zip(percentiles, quantile_values):\n", " print(f\" {p}th percentile: {v:.2f}\")\n", "\n", "# Calculate IQR (Interquartile Range)\n", "q1, q3 = np.percentile(test_scores, [25, 75])\n", "iqr = q3 - q1\n", "print(f\"\\nInterquartile Range (IQR): {iqr:.2f}\")\n", "print(f\"This means the middle 50% of scores fall within a range of {iqr:.2f} points\")" ] }, { "cell_type": "code", "execution_count": null, "id": "cad696f3-45c8-447a-be17-32360d875aaf", "metadata": {}, "outputs": [], "source": [ "# Box plot visualization\n", "fig, ax = plt.subplots(figsize=(8, 6))\n", "\n", "box_plot = ax.boxplot(test_scores, vert=True, patch_artist=True,\n", " tick_labels=['Test Scores'])\n", "\n", "# Customize box plot\n", "box_plot['boxes'][0].set_facecolor('lightblue')\n", "box_plot['medians'][0].set_color('red')\n", "box_plot['medians'][0].set_linewidth(2)\n", "\n", "# Add annotations\n", "ax.text(1.1, np.median(test_scores), f'Median: {np.median(test_scores):.1f}', \n", " va='center')\n", "ax.text(1.1, q1, f'Q1: {q1:.1f}', va='center')\n", "ax.text(1.1, q3, f'Q3: {q3:.1f}', va='center')\n", "\n", "ax.set_ylabel('Score')\n", "ax.set_title('Box Plot of Test Scores')\n", "ax.grid(True, alpha=0.3)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "7fb34b8c-c824-454a-bee9-1ef92129b4f0", "metadata": {}, "outputs": [], "source": [ "# Outlier detection using Z-scores\n", "mean = test_scores.mean()\n", "std = test_scores.std()\n", "\n", "# Calculate z-scores\n", "z_scores = (test_scores - mean) / std\n", "\n", "# Find outliers (typically |z| > 3, but let's use 2.5 for this example)\n", "outlier_threshold = 2.5\n", "outliers_mask = np.abs(z_scores) > outlier_threshold\n", "outliers = test_scores[outliers_mask]\n", "\n", "print(f\"Number of outliers (|z| > {outlier_threshold}): {len(outliers)}\")\n", 
"print(f\"Outlier values: {outliers.round(1)}\")\n", "\n", "# Visualize outliers\n", "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n", "\n", "# Histogram with outliers marked\n", "ax1.hist(test_scores, bins=30, alpha=0.7, color='skyblue', edgecolor='black')\n", "ax1.scatter(outliers, np.zeros_like(outliers), color='red', s=100, \n", " marker='^', label=f'Outliers (|z| > {outlier_threshold})')\n", "ax1.set_xlabel('Test Score')\n", "ax1.set_ylabel('Frequency')\n", "ax1.set_title('Distribution with Outliers Marked')\n", "ax1.legend()\n", "\n", "# Z-scores distribution\n", "ax2.hist(z_scores, bins=30, alpha=0.7, color='lightgreen', edgecolor='black')\n", "ax2.axvline(-outlier_threshold, color='red', linestyle='--', label=f'z = ±{outlier_threshold}')\n", "ax2.axvline(outlier_threshold, color='red', linestyle='--')\n", "ax2.set_xlabel('Z-score')\n", "ax2.set_ylabel('Frequency')\n", "ax2.set_title('Z-score Distribution')\n", "ax2.legend()\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "1efed29e-e7fb-4e09-90ca-f68534c74d1e", "metadata": {}, "source": [ "## 4. Creating a Reusable Analysis Function\n", "\n", "Let's wrap our analysis steps into a function. Fill in the TODO sections!" 
] }, { "cell_type": "code", "execution_count": null, "id": "d97cfc33-bf48-46ae-bd11-632ac2e78495", "metadata": {}, "outputs": [], "source": [ "def analyze_1d_data(data, name=\"Data\", outlier_z=2.5, bins=30):\n", " \"\"\"\n", " Comprehensive analysis of 1D data\n", " \n", " Parameters:\n", " -----------\n", " data : array-like\n", " The data to analyze\n", " name : str\n", " Name for the dataset (for plot titles)\n", " outlier_z : float\n", " Z-score threshold for outlier detection\n", " bins : int\n", " The number of bins for the histogram\n", " \n", " Returns:\n", " --------\n", " stats_dict : dict\n", " Dictionary containing statistical summaries\n", " fig : matplotlib figure\n", " Figure with visualizations\n", " \"\"\"\n", " # Convert to numpy array\n", " data = np.array(data)\n", " \n", " # Calculate statistics\n", " stats_dict = {\n", " 'mean': None, # TODO: calculate mean\n", " 'std': np.std(data),\n", " 'median': None, # TODO: calculate median\n", " 'q1': np.percentile(data, 25),\n", " 'q3': np.percentile(data, 75),\n", " 'iqr': None, # TODO: calculate IQR using q1 and q3\n", " 'min': data.min(),\n", " 'max': data.max()\n", " }\n", "\n", " # Calculate outliers\n", " z_scores = (data - stats_dict['mean']) / stats_dict['std']\n", " outliers_mask = None # TODO: create boolean mask for outliers using outlier_z\n", " n_outliers = np.sum(outliers_mask)\n", " \n", " # Create visualizations\n", " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\n", " \n", " # Left plot: Histogram and KDE\n", " # TODO: Create histogram on ax1\n", "\n", " # Add KDE\n", " kde_xs = np.linspace(data.min(), data.max(), 200)\n", " kde = stats.gaussian_kde(data)\n", " ax1.plot(kde_xs, kde(kde_xs), 'r-', linewidth=2, label='KDE')\n", " \n", " ax1.set_xlabel('Value')\n", " ax1.set_ylabel('Density')\n", " ax1.set_title(f'{name}: Distribution')\n", " ax1.legend()\n", " ax1.grid(True, alpha=0.3)\n", " \n", " # Right plot: Box plot\n", " # TODO: Create box plot on ax2\n", "\n", " 
ax2.set_ylabel('Value')\n", " ax2.set_title(f'{name}: Box Plot')\n", " ax2.grid(True, alpha=0.3)\n", " \n", " plt.suptitle(f'Analysis of {name} (n={len(data)}, outliers={n_outliers})')\n", " plt.tight_layout()\n", " \n", " return stats_dict, fig" ] }, { "cell_type": "markdown", "id": "bcab7809-0ea2-4f87-b390-b2269917b372", "metadata": {}, "source": [ "## 5. Analyzing Sophisticated Examples\n", "\n", "Now let's apply our function to more complex distributions! Summarize the features of each dataset with a sentence." ] }, { "cell_type": "code", "execution_count": null, "id": "deeb55c8-01f1-4824-9077-20b4cbd5c6b9", "metadata": {}, "outputs": [], "source": [ "# TODO: create a function that converts a string in the format below into a NumPy array\n", "def string_to_dataset(s):\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "78738bcd-ddad-4660-8816-4a3a94159647", "metadata": {}, "outputs": [], "source": [ "dataset_string_1 = '''\n", "[44.5 0. 50. 60. 59.3 50.4 48.5 36.1 45.7 72. 39.2 37.4 28.5 41.2\n", " 0. 60.6 33.8 32.3 62.6 41. 58.9 43.2 59.6 58.9 50.7 52.4 58.2 52.4\n", " 42.6 66.5 43. 49. 52.5 67.5 0. 31.1 46.7 51.7 60.7 48.9 48.1 53.4\n", " 48.8 58.7 47.5 22.1 43.3 48.4 54.7 32.4 48. 39.1 33.6 51.3 52.9 46.1\n", " 54.2 47.7 50.5 46.5 51.6 51.4 53.2 29.5 69.3 40.1 47.7 46.3 46.4 59.6\n", " 61. 43.3 60.7 46.5 46.4 61.6 64.9 48.7 37.5 55.9 44.2 37.6 32.5 57.5\n", " 48.3 51.8 66.1 42.5 38.5 49.6 61.8 43.1 42. 62.6 41. 41.3 51.2 57.7\n", " 45.9 60.4 48.6 57.7 44.1 47.2 48. 35.3 44.5 40.3 60.9 43.6 68. 68.1\n", " 59.8 55.7 65. 51.3 52.5 59.6 47.2 51.4 47.6 48.6 31.6 60.3 49.9 43.3\n", " 50.3 0. 53.8 40.8 49.9 50. 51.3 54.4 53.6 43.2 55.8 32.7 43.4 43.2\n", " 49. 53.9 53.8 47.5 60.7 55.3 60.4 51.5 56.7 49.9 76. 46.5 63.2 65.\n", " 42.4 35.7 43.3 37.5 56.1 56.9 67.4 59.7 36.7 48.8 36.7 50.9 46.4 66.\n", " 27.6 51.7 45.2 35.8 59.9 47.3 0. 57.5 35.8 44. 46.5 53. 
59.5 57.5\n", " 63.3 64.7 39.6 43.7 53.8 40.4 72.1 51.2 40.7 58.5 41.5 60.5 44.5 67.2\n", " 47.7 51. 55.9 55.4 49. 43.4 49.6 0. 30.8 40.1 57.1 69.4 50.4 45.7\n", " 42.6 60.7 36.4 49.5 38.6 58.1 57.8 53.1 67.8 36.8 25.6 0. 32.9 0.\n", " 56.4 51.8 28.8 55. 40.7 47. 43.6 41.8 40.2 35.5 25.8 37.4 39.1 41.2\n", " 43.9 65.2 56.3 62.5 32.4 45.3 66.3 66.6 49.4 54.3 43.9 62.6 61.6 48.1\n", " 36.3 36. 28.3 61.4 0. 66.4 55.9 52.7 58.1 54.4 60.9 50.6 33. 47.5\n", " 44.5 42.7 63.2 59.3 0. 67.2 31.4 37.3 49.7 59.4 47.1 52.6 45.9 53.\n", " 63.3 29.6 62.7 54.5 60.8 38. 55.1 44.7 35.9 58.2 50.7 39. 79.6 46.\n", " 46.5 65.2 33.1 48.6 59.8 66.3 48.6 62. 37.4 75.6 35.8 49.1 49.7 61.1\n", " 43.2 46. 27.5 53.8 53.5 53.7 43.1 43.9 46.2 70.9 56.9 47.8 53.2 61.7\n", " 56.8 64.9 53.4 62.6 52.2 40.3 65.9 55.7 42.3 63.7 48.5 59.1 47.8 51.4\n", " 40.4 39.6 57.5 36.9 45.7 37.9 51.2 0. 63.1 41.7 37.3 54. 44. 50.7\n", " 43.2 44.5 0. 55.9 51.6 47.7 73.9 64.6 41.9 49.9 50.1 43.1 58.9 0.\n", " 57.5 51.4 42.4 0. 0. 39.5 48.7 55.6 38.2 36.7 34.7 61.7 46.9 45.5\n", " 59.1 61.6 64.2 54.7 39.9 46.5 64. 38.8 62.6 57. 0. 47.2 52.8 50.5\n", " 65.3 54.1 44.5 48.3 85.7 45.6 35.3 0. 51.1 22.1 57. 55.7 46.5 62.7\n", " 54.6 59.5 48.6 45. 65.2 52.8 58.7 52.8 0. 67. 58.9 0. 65.3 39.6\n", " 48.1 54.1 39.3 61.5 58.1 57.7 0. 49.8 36.4 43.8 46.6 53.7 50.2 38.8\n", " 56.3 55.2 57.4 39.5 41.3 29.7 35.2 36.3 46.8 33.4 49.4 56. 52.7 34.2\n", " 47.6 38.4 46.6 62.2 59. 36. 35. 47.9 57.5 47.7 46.8 53.9 49.1 49.6\n", " 37.4 66.5 39.2 49.7 49.3 49.4 59.8 36.2 50. 0. 44.9 67.4 56.7 45.9\n", " 50.3 52. 36.3 51.1 54.2 40.6 52.8 52.5 54.3 57.3 37.1 51.2 52.8 48.9\n", " 49. 34.8 0. 66.1 40.3 44.7 31.3 61.3 59. 52.8 22. 39. 25.7 0.\n", " 59.9 50.5 0. 69.9 56.4 0. 64.5 47.4 49.9 56.1 46.2 0. 56.8 49.7\n", " 68.7 30.2 49.9 45.7 64.1 42.7 38.7 0. 63.4 43.8 25.6 53.5 45.8 39.\n", " 50.3 60.1 60.3 0. 59.1 56. 40. 73.3 60.7 48.4 51. 42.4 42.1 28.1\n", " 68.6 43.6 51.3 45.8 51.9 40.5 51.3 52.5 50.8 52. 33.9 58. 57. 
49.3\n", " 60.1 58.9 50.5 42.6 51.9 46.1 44.7 35.8 61.3 42.6 65.4 48.3 41.3 46.1\n", " 47.7 49. 58.9 44.5 51.4 0. 45.4 56.9 35.9 48.7 41.7 52.8 51.6 43.\n", " 46.9 34.1 48.4 63.1 70.6 55. 38.4 53.1 50.8 41.8 38.7 43.5 59.3 56.3\n", " 72.4 45.6 45.4 52.8 57.4 61.8 37.9 56.7 36.9 61.1 28.8 47.4 53.9 51.6\n", " 52. 69.8 55.8 62.2 33.5 44.2 43.2 55.6 59.7 39.1 0. 0. 48.7 54.8\n", " 56.7 48.4 48.3 0. 57.9 56.7 66.2 34. 44.1 47.5 39.1 53.2 54.1 33.9\n", " 49.2 72.5 0. 39.8 48.8 54.2 49.7 44.4 55.1 31. 66. 40.1 0. 32.3\n", " 53.4 41.8 45.5 56.9 0. 52.1 52.3 44.5 68.8 31.2 43.1 39.4 71.9 54.7\n", " 47.3 44.5 44.7 59.8 68.5 39.2 50.5 48.7 59.5 50.6 61.9 46.2 55.5 0.\n", " 67.2 52.9 34. 34.9 56.9 49. 49.2 51.8 55.8 44.3 54.2 48.8 41.5 51.2\n", " 55.9 37.9 0. 44.1 55.5 49.6 51.1 47.9 40.8 38. 46.7 47.3 40.9 36.5\n", " 42.1 39.2 57.4 42.8 45.3 56.4 52.8 67.9 54.1 63.4 50.9 0. 29. 41.2\n", " 41.4 62.3 62.6 52.3 53.1 57.6 17.7 62.2 44.7 55.8 64.9 45.7 51.6 51.7\n", " 53.1 41.6 58.8 44. 40. 0. 60.6 47.9 60.3 56. 53.6 52.8 0. 47.3\n", " 48.5 33.9 60. 56.9 60.6 44.5 23.7 50.7 30.7 0. 62.4 53. 47.5 45.1\n", " 57.5 53.6 42.7 46.2 46.5 51.5 50.4 34.4 0. 51.8 0. 47.5 42.3 47.3\n", " 50. 0. 43.4 53.3 50.3 65.2 34.3 36.9 46.8 50.2 34.2 0. 69.3 49.9\n", " 38.5 53.3 52.1 53.9 58.4 48.3 66.1 69.6 69.6 0. 63.8 50.3 42.5 48.3\n", " 44. 51.5 0. 48.1 0. 46.7 41.8 43.3 57.7 43.2 38.8 61.8 52. 44.2\n", " 41.8 0. 44.8 46. 66.1 37.3 0. 42.4 52.5 36.2 46. 65.2 46. 37.8\n", " 48.7 47.3 53. 50.9 47.9 24.4 38.3 58.1 57.9 59.9]\n", " '''" ] }, { "cell_type": "code", "execution_count": null, "id": "612f13a1-1c8d-49e2-b738-df09a1db66f7", "metadata": {}, "outputs": [], "source": [ "dataset_string_2 = '''\n", "[39.7 79.6 38. 74.6 28.6 69.5 68.5 23.8 28.8 73.2 71.6 68.2 39.2 28.4\n", " 27.6 69.5 34.9 76.3 25.1 60.8 36.9 66.1 36. 71. 73.4 74.7 66.6 32.9\n", " 28.4 32.2 31.5 77.5 71. 
33.7 29.6 31.5 25.2 27.3 39.1 27.7 68.6 31.3\n", " 27.2 73.7 34.3 72.4 30.1 68.8 27.3 28.5 58.5 33.4 72.4 72.4 40.7 70.8\n", " 68.3 32.8 27.5 72.9 65.6 31.8 67.2 36. 62.3 76.9 29.2 26.8 74.6 69.4\n", " 42. 24.4 31.9 63.9 31.3 60.6 29.6 62.8 68. 67.2 74.1 79.2 77.2 74.5\n", " 72.5 63.8 64. 28.8 42.9 39.6 73.4 61.2 25.8 73.6 34.5 61.7 25.6 37.7\n", " 80.3 71.3 62.7 27.3 37.4 65. 24.2 72.8 67.9 76.5 26.7 78.4 62.9 70.1\n", " 62.7 67.6 72. 65.7 26.7 69. 30.7 30.1 64.9 69.1 59.3 69.5 24.3 30.5\n", " 73.4 36. 26.8 24.6 74.8 72.5 34.4 32. 35.7 67.9 68.8 28.1 68.4 64.1\n", " 72.7 66.8 71. 73.1 61. 30.4 19.9 75.7 35.7 35.7 68.2 66. 68.8 78.7\n", " 28.8 79.7 76.9 78.2 64.5 23.9 32.1 64.2 29.6 74.1 73. 21.3 34.7 26.1\n", " 76.3 32.2 34.6 28.4 30.7 22.5 33.5 62.7 62. 28. 25.7 70.8 24.9 31.2\n", " 74.6 70.6 30.5 71.7 33. 69.5 36.8 32.9 25.5 75.7 18.9 25.7 40.1 41.4\n", " 25. 15.3 71.1 64.3 72.9 33.4 65.6 30.4 72.4 33.4 70.1 28.9 67.7 69.7\n", " 70.5 65.3 26.4 67.5 25. 65.4 68.6 24.8 58.2 18.4 27.5 71.3 23.5 76.2\n", " 21.3 73.7 33.1 72.3 63.5 73.7 26. 38.1 27.6 74.8 71.7 65.6 34.3 70.4\n", " 34.4 67. 66.3 30.2 61.7 25.5 67. 60.6 28.4 25.8 31.9 69.2 26.1 30.6\n", " 31.8 20.5 19.8 65.6 28.5 33.8 66.5 68.4 34. 23.5 28.6 24.2 64.9 57.2\n", " 60.6 35.4 28.5 69.6 70.7 77. 72.9 71.1 84.1 29.5 25.9 71.2 31. 30.4\n", " 33.1 38.6 71. 31.8 67.2 35.7 35.9 74.5 69.8 29.3 65.4 77.1 35.7 74.\n", " 29.9 72.2 35.5 29.5 79.1 75.4 23. 68.8 70.8 28.7 67.5 34.7 35.1 65.2\n", " 75.6 24.7 33.2 64.6 35.8 20.4 72.5 20.9 28.2 69.7 29.7 81.7 29.4 35.8\n", " 71.3 31.8 71. 72.6 29.4 32.8 68.9 71.3 29.3 33.7 33.9 67.2 31.5 74.8\n", " 20.9 26.4 26.1 72.8 66.8 36.6 77.3 74.6 29.9 28.3 64. 33.2 24.8 28.2\n", " 67.9 30.2 21.6 70.8 62.8 73.3 77.4 23.9 26.3 73.2 30.4 30.6 35.5 68.8\n", " 34.3 31.2 22.8 66.9 76.6 67.6 75.5 70.2 30. 64.5 66.7 28.3 77.1 26.\n", " 30.4 36.2 41.6 20.7 26.2 72.4 65.5 29.4 26.6 63.7 32.9 77. 29.7 65.5\n", " 36.9 79.8 35.9 67. 
69.6 30.3 73.6 57.5]\n", "'''" ] }, { "cell_type": "code", "execution_count": null, "id": "5b723918-4189-42fd-ad35-160812b56179", "metadata": {}, "outputs": [], "source": [ "dataset_string_3 = '''\n", "[ 97.9 117.8 102.6 116.2 87.7 137. 52.8 83.3 92.5 103.3\n", " 124. 122. 74.7 101. 103.9 137.9 93.6 84.7 130.2 111.1\n", " 116.5 100.2 92.2 52.8 78.2 67.9 112.2 119. 92.9 114.7\n", " 93.6 101. 106.2 99.5 95.7 77. 62.3 96.5 106.8 118.3\n", " 116.5 40.2 75.9 111.9 26.8 102.5 107.6 121.4 96.6 111.6\n", " 83.2 50.9 -5.6 48.6 109.5 80.8 111.1 125.5 107.7 131.3\n", " 91.7 110.9 127.6 71.1 85.2 113.7 137.7 117.8 60.8 86.1\n", " 45.7 88.3 101.2 62.2 135.1 121.1 104.4 112.5 128.2 115.9\n", " 92.8 92.1 102.4 91.8 148.2 123.9 115.3 109.7 66.8 138.6\n", " 119. 102.8 47.5 36.2 104. 87.7 58.7 107.9 119. 87.1\n", " 99.3 140.9 80.2 62.2 111.6 106.6 95.3 122.4 97. 110.7\n", " 354.4 83.7 88. 101. 100.7 87. 88.3 106.5 99.2 116.1\n", " 69.2 87.7 112.5 97.9 122.8 77.1 49.7 140.2 112.1 32.2\n", " 104.1 102.8 107.5 147.3 68.3 137.9 87.8 162.7 100.5 128.8\n", " 138.3 79.8 106. 110.2 93.9 118.3 104.2 69.5 97.1 146.\n", " 36.6 99.1 87.2 106.6 126. 102. 156.5 79.8 70.3 101.2\n", " 98.9 145. 105.1 120.5 109.6 86.2 140.9 89.9 89. 110.7\n", " 104. 113.4 67.2 40. 105.1 106.7 -7.8 122.3 169.1 72.1\n", " 121. 119.4 111.7 98. 111.9 83.1 110.6 134. 116.9 125.6\n", " 113.9 87.4 91.4 79.6 104.5 143.4 83.9 143.4 90.5 97.\n", " 109. 62.4 111.8 120.9 118.6 109. 114.5 97.7 120.8 103.2\n", " 69.3 104.5 100.9 117.8 94.1 136.6 119.5 87.7 185.4 115.6\n", " 177.3 62.9 124.6 67. 32.7 99.4 117.5 98.4 210.7 108.3\n", " 112.5 110.4 87.3 60.6 180. 74. 97.4 233. 96.7 83.5\n", " 79.6 84.3 103.6 91.4 84.9 78.7 105.8 117.2 78. 75.8\n", " 111.1 97.6 95.7 95.1 112.9 100.2 96.1 120.8 79.9 89.9\n", " 230.8 131.1 62. 117.5 147.2 121.9 98.2 113. 87.5 -147.1\n", " 91.2 97.3 244.8 110.4 101. 103.8 78.5 111.6 78.7 81.5\n", " 105.3 109.6 102.1 102. 
50.3 103.3 74.8 103.8 80.7 266.2\n", " -20.2 82.7 32.1 73.6 95.5 96.5 60.4 84.2 107.3 66.2\n", " 80.1 147. 70.2 135.7 81.3 108.3 83.2 108.7 62.7 108.1\n", " 102.8 91.8 75.5 210.5 107.1 35.3 115.9 115.1 89.8 105.5\n", " 101.2 89.7 66.5 116.7 85.7 92.1 86.5 124.9 104.7 107.\n", " 92. 116.1 55.3 113.4 78.5 127.2 117.5 109.9 95.5 100.9\n", " 125.7 96.2 103.2 104.4 91.8 120.3 106. 65.7 91.2 79.9\n", " 139.6 126.2 107.2 77.2 101.9 22.6 118. 99. 99.8 107.1\n", " 88.5 87.7 100. 82.7 110.2 91.1 95.2 54.7 90.3 86.1\n", " 93.8 193.3 98.7 122.5 86.7 103.1 113.4 129.7 126.5 72.3\n", " 97.6 20. 107.2 111.4 126.3 61.1 107.6 107.7 125.2 72.6\n", " 107.1 88.2 117.2 106. 82.3 141.7 81.3 115.8 118.3 120.\n", " 84. 115.6 74.2 96.9 118.8 14.4 80.7 96.5 113.4 102.6\n", " 70.5 137.3 123.1 83.6 82.7 71.3 86. 86.6 126.2 88.7\n", " 125.8 102.5 88. 131.8 104.2 100.7 100.1 65.1 108.7 68.1\n", " 100.6 110.3 107.2 168.1 73.1 47.4 99.2 100. 103.6 87.1\n", " 106.6 181.5 114.2 93.4 105.3 97.1 82.4 117.7 64.1 124.3\n", " 120. 97.7 124.4 103.2 99.7 92.3 145.9 111.7 109.4 107.\n", " 104.7 155.9 73.7 94.8 60.6 96.2 74.3 88.1 77.3 116.7\n", " 96.4 88.4 189.6 102.3 83.5 102.5 126.6 45.5 110.5 87.9\n", " 94.9 209.8 125. 124.1 75.4 94.8 80.2 63.9 -6.3 91.7\n", " 101.2 91.6 84.2 118.1 50.4 111.5 111.3 97.4 66.5 86.7\n", " 103.8 104.5 78.6 51.4 82.2 64. 135.3 221.7 111.2 92.5\n", " 123.2 51.8 114.6 79.6 65.2 91.2 113.2 119.2 121.4 103.7\n", " 73.1 94.3 142.9 111.3 122.4 123. 19.6 111.4 103.9 97.8\n", " 59.8 120.6 139.8 133. 86. 71.3 89.1 101. 129.2 93.7\n", " 104.1 165.3 131.3 119.2 79.1 103.8 90. 119.6 89.8 57.5\n", " 105.6 15.2 93.9 96.1 89. 80.8 109. 100.4 36.9 80.7\n", " 119.9 116.4 121.8 111.5 80. 103.3 88.5 121.7 106. 114.2\n", " 89.8 91.3 115.1 83.5 115. 126. 75.2 110.6 150.7 87.9\n", " 105.3 76.6 116.9 95.7 80.1 132.9 115.4 132.5 59.5 79.6\n", " 111.8 93.9 199.7 85.4 110. 112.3 111.4 99.1 139.4 101.2\n", " 89.6 127.2 45.7 239.2 106.9 67.2 110.6 112.4 84.1 43.3\n", " 124.9 122. 
66.2 42.7 45.3 84.8 111.8 89.5 134.5 49.7\n", " 111.8 105.3 90.2 114.8 110.8 94.5 99.4 99.6 128.6 40.5\n", " 70.9 106.3 94.8 70.4 90.4 108. 70. 94.1 134. 91.3\n", " 85.3 126.6 127.3 110.1 122.8 118.5 100.8 127.4 94.7 85.2\n", " 127. 80.7 100.2 119.7 67.3 92.4 94.3 75.3 104.6 102.5\n", " 104.7 94.6 73.7 106.6 109.7 89.2 109.1 104.1 121.1 83.3\n", " 105.3 105.5 124.6 141.4 91. 83.8 109.9 71.9 97.1 93.7\n", " 108.2 123.1 105.8 43.1 109.5 89.5 148. 90.4 74.9 110.8\n", " 180.4 105.4 102.9 117.3 120.2 92.5 105.1 76.6 71. 85.5\n", " 106.3 97.5 68.2 103.4 109.6 85.2 104.7 87. 43.6 127.3\n", " 85.2 115.8 111.1 92. 115.4 99.2 65.8 46.4 129.5 103.4\n", " 106.8 90.3 54.9 74.9 101.8 106. 63.3 103.5 82. 96.6\n", " 101.3 152.3 59.1 86.4 80.1 118.5 101.6 108.7 128. 125.1\n", " 87.3 101.8 110.4 66.9 116.4 114.2 91. 78. 90.8 125.7\n", " 130.4 86.6 5.2 102.2 137.1 100.6 110.1 119.7 95. 50.3\n", " 110.6 78.5 92.5 137.1 77.4 77.8 110.1 79.4 143.9 129.3\n", " 101.6 178.4 109.8 50. 118.5 101.7 119.3 75. 100.9 103.6\n", " 171.7 117.7 134.5 79.9 102.5 166.5 148. 81.8 87.1 48.8\n", " 111.9 65.7 101.3 142.9 69.2 129.9 62.8 176.6 111.6 104.\n", " 86. 85.7 113.2 95.1 56.3 106.2 77.1 118.8 95.9 111.6\n", " 99.5 84.8 100.3 140.6 73.2 93.6 83.3 110.3 114.5 117.7\n", " 69.1 99.3 69.8 100.5 98.9 67.6 110.7 104.1 110.1 115.4\n", " 109.3 104.5 108.5 116.1 72.2 97.6 120. 150.7 124.2 81.9\n", " 119.1 24.5 84.4 147.5 86.7 96.9 116.4 93.2 72.3 108.6\n", " 115.6 105.1 89. 69.7 21. 92.5 99.1 95.4 124.5 88.7\n", " 45.8 110.1 127.4 81.4 84. 94.4 98.5 140.9 117.9 66.1\n", " 62.7 124.8 93.9 80.8 90.6 105.5 104.9 86.5 94.9 112.8\n", " 108.5 77.9 94.6 112.6 -2.3 87.6 110.8 106.9 130.8 109.7\n", " 73.6 159.1 98.5 117.4 80.9 92. -173. 120.1 113.3 128.5\n", " 65. 63.1 97.9 102.5 114. 120.6 141.4 111. 111.6 129.7\n", " 80.1 85.8 56.9 116.3 80. 91.8 17.7 101.1 90.6 91.2\n", " 109.4 77.2 68.6 130.1 228.8 86.4 91.3 85.7 131.6 128.2\n", " 109.9 132. 
74.5 112.1 36.3 145.4 85.2 48.5 72.2 101.7\n", " 104.1 104.1 92.1 66.7 103.3 113.3 95. 102.9 63.7 105.6\n", " 88.1 180.1 98.3 36.2 82.1 115.9 101.6 91. 89.5 94.4\n", " 68.2 165.9 44.1 111.7 110.4 154.1 119.8 100.7 130.5 106.7\n", " 38.6 85.3 145. 101.7 99.5 124.8 86.8 99.6 103.8 92.6\n", " 109.3 35.2 142.4 57.8 96.2 92. 111.1 157.6 103.5 88.7\n", " 111.7 -27.1 151.8 82.6 119.7 120.5 93.3 94.4 105.7 97.2]\n", "'''" ] }, { "cell_type": "markdown", "id": "e0b55867-5509-472d-b364-239d4cfe913d", "metadata": {}, "source": [ "## 6. Distribution Transformation Challenge\n", "\n", "Sometimes we need to transform data to make it easier to work with. Let's explore this concept!" ] }, { "cell_type": "code", "execution_count": null, "id": "c1fc07df-b5a4-4fb9-afb6-d4c01bbc4595", "metadata": {}, "outputs": [], "source": [ "# Demonstration: Log transformation\n", "np.random.seed(111)\n", "\n", "# Generate log-normal data (common in nature: income, city sizes, etc.)\n", "log_normal_data = np.random.lognormal(mean=3, sigma=0.5, size=1000)\n", "\n", "# Apply log transformation\n", "transformed_data = np.log(log_normal_data)\n", "\n", "# Visualize before and after\n", "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\n", "\n", "# The histograms show counts, while the KDE is a density (area = 1).\n", "# To overlay them, scale the density by N * histogram bin width.\n", "\n", "# Original data\n", "_, bin_edges, _ = ax1.hist(log_normal_data, bins=40, alpha=0.7, color='coral', edgecolor='black')\n", "kde = stats.gaussian_kde(log_normal_data)\n", "kde_xs = np.linspace(log_normal_data.min(), log_normal_data.max(), 200)\n", "ax1.plot(kde_xs, kde(kde_xs) * len(log_normal_data) * (bin_edges[1] - bin_edges[0]), \n", " 'r-', linewidth=2)\n", "ax1.set_xlabel('Value')\n", "ax1.set_ylabel('Frequency')\n", "ax1.set_title('Original: Log-Normal Distribution (skewed)')\n", "\n", 
"# Transformed data\n", "_, bin_edges, _ = ax2.hist(transformed_data, bins=40, alpha=0.7, color='lightgreen', edgecolor='black')\n", "kde = stats.gaussian_kde(transformed_data)\n", "kde_xs = np.linspace(transformed_data.min(), transformed_data.max(), 200)\n", "ax2.plot(kde_xs, kde(kde_xs) * len(transformed_data) * (bin_edges[1] - bin_edges[0]), \n", " 'g-', linewidth=2)\n", "ax2.set_xlabel('log(Value)')\n", "ax2.set_ylabel('Frequency')\n", "ax2.set_title('Transformed: Normal Distribution!')\n", "\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "print(\"Notice how the log transformation turned a skewed distribution into a bell curve!\")" ] }, { "cell_type": "code", "execution_count": null, "id": "087fe3e5-0f0c-4f42-a1a0-01aca4113108", "metadata": {}, "outputs": [], "source": [ "# Challenge: Mystery distribution\n", "np.random.seed(222)\n", "\n", "# Generate a mystery distribution\n", "uniform_base = np.random.uniform(0, 1, 1000)\n", "mystery_data = uniform_base ** 2 * 100 # This creates a specific shape\n", "\n", "print(\"You've been given a mystery dataset!\")\n", "print(f\"Range: [{mystery_data.min():.1f}, {mystery_data.max():.1f}]\")\n", "print(f\"Mean: {mystery_data.mean():.1f}\")\n", "print(\"\\nYour task: Find a transformation that makes this data roughly uniform or normal!\")\n", "\n", "# Visualize the mystery data\n", "plt.figure(figsize=(8, 5))\n", "plt.hist(mystery_data, bins=40, alpha=0.7, color='purple', edgecolor='black')\n", "plt.xlabel('Value')\n", "plt.ylabel('Frequency')\n", "plt.title('Mystery Distribution - Can you transform it?')\n", "plt.show()\n", "\n", "# TODO: Try different transformations\n", "# Hints: Consider sqrt, log, power transformations\n", "transformed = None # TODO: Apply your transformation\n", "\n", "# TODO: Visualize your transformed data\n", "# Is it more uniform? More normal-looking?" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }