Tidy-TS

Comprehensive Workflows

Real-world data analysis workflows that combine all tidy-ts features. These examples show how to use the library effectively.

Complete Data Analysis Pipeline

A complete workflow combining all major tidy-ts operations

This example demonstrates a complete data analysis workflow from data loading through transformation, filtering, grouping, and summarization.

import { createDataFrame, stats as s, readCSV } from "@tidy-ts/dataframe";
import { z } from "zod";

// 1. Load and validate data
const PersonSchema = z.object({
  name: z.string(),
  age: z.number(),
  city: z.string(),
  score: z.number(),
});

const csvData = `name,age,city,score
Alice,25,New York,85
Bob,30,Los Angeles,92
Carol,28,Chicago,78
Dave,35,Houston,88`;

const people = await readCSV(csvData, PersonSchema);

// 2. Complete analysis pipeline
const analysis = people
  // Transform data
  .mutate({
    age_group: (row) => {
      if (row.age < 25) return "Young";
      if (row.age < 35) return "Adult";
      return "Senior";
    },
    score_category: (row) => {
      if (row.score >= 90) return "Excellent";
      if (row.score >= 80) return "Good";
      if (row.score >= 70) return "Fair";
      return "Poor";
    },
    z_score: (row, _index, df) => {
      const mean = s.mean(df.score);
      const std = s.stdev(df.score);
      return s.round((row.score - mean) / std, 3);
    },
  })
  // Filter data
  .filter((row) => row.score >= 80)
  // Group and summarize
  .groupBy("age_group")
  .summarise({
    count: (group) => group.nrows(),
    avg_score: (group) => s.round(s.mean(group.score), 2),
    max_score: (group) => s.max(group.score),
    min_score: (group) => s.min(group.score),
    score_std: (group) => s.round(s.stdev(group.score), 2),
  })
  // Sort results
  .arrange("avg_score", "desc");

analysis.print("Complete analysis results:");

import { createDataFrame, stats as s, readCSV } from "@tidy-ts/dataframe";
import { z } from "zod";

// 1. Load and validate data
const PersonSchema = z.object({
  name: z.string(),
  age: z.number(),
  city: z.string(),
  score: z.number(),
});

const csvData = `name,age,city,score
Alice,25,New York,85
Bob,30,Los Angeles,92
Carol,28,Chicago,78
Dave,35,Houston,88`;

const people = await readCSV(csvData, PersonSchema);

// 2. Complete analysis pipeline
const analysis = people
  // Transform data
  .mutate({
    age_group: (row) => {
      if (row.age < 25) return "Young";
      if (row.age < 35) return "Adult";
      return "Senior";
    },
    score_category: (row) => {
      if (row.score >= 90) return "Excellent";
      if (row.score >= 80) return "Good";
      if (row.score >= 70) return "Fair";
      return "Poor";
    },
    z_score: (row, _index, df) => {
      const mean = s.mean(df.score);
      const std = s.stdev(df.score);
      return s.round((row.score - mean) / std, 3);
    },
  })
  // Filter data
  .filter((row) => row.score >= 80)
  // Group and summarize
  .groupBy("age_group")
  .summarise({
    count: (group) => group.nrows(),
    avg_score: (group) => s.round(s.mean(group.score), 2),
    max_score: (group) => s.max(group.score),
    min_score: (group) => s.min(group.score),
    score_std: (group) => s.round(s.stdev(group.score), 2),
  })
  // Sort results
  .arrange("avg_score", "desc");

analysis.print("Complete analysis results:");

Async Data Enrichment Workflow

Handling asynchronous operations in a real-world data pipeline

This workflow shows how to handle asynchronous operations like API calls and data validation while maintaining type safety and performance.

// Simulate external API calls
async function fetchRegionalBonus(region: string): Promise<number> {
  await new Promise((resolve) => setTimeout(resolve, 1));
  const bonuses = { "North": 1.2, "South": 1.1, "East": 1.15, "West": 1.05 };
  return bonuses[region as keyof typeof bonuses] || 1.0;
}

async function validateDataQuality(score: number): Promise<boolean> {
  await new Promise((resolve) => setTimeout(resolve, 1));
  return score >= 0 && score <= 100;
}

// Sales data
const salesData = createDataFrame([
  { id: 1, region: "North", product: "Widget A", sales: 1000, quarter: "Q1" },
  { id: 2, region: "South", product: "Widget B", sales: 800, quarter: "Q1" },
  { id: 3, region: "East", product: "Widget A", sales: 1200, quarter: "Q1" },
  { id: 4, region: "West", product: "Widget B", sales: 900, quarter: "Q1" },
]);

// Complete async workflow
const enrichedAnalysis = await salesData
  .mutate({
    // Sync operations
    product_category: (row) => row.product.includes("A") ? "Category A" : "Category B",
    
    // Async operations
    regional_bonus: async (row) => await fetchRegionalBonus(row.region),
    is_valid_sale: async (row) => await validateDataQuality(row.sales),
  })
  .filter(async (row) => await row.is_valid_sale)
  .mutate({
    adjusted_sales: (row) => row.sales * row.regional_bonus,
    performance_tier: (row) => {
      if (row.adjusted_sales >= 1200) return "High";
      if (row.adjusted_sales >= 900) return "Medium";
      return "Low";
    },
  })
  .groupBy("region")
  .summarise({
    total_sales: (group) => s.sum(group.sales),
    total_adjusted: (group) => s.sum(group.adjusted_sales),
    avg_bonus: (group) => s.round(s.mean(group.regional_bonus), 3),
    high_performance_count: (group) => 
      group.filter((row) => row.performance_tier === "High").nrows(),
  })
  .arrange("total_adjusted", "desc");

enrichedAnalysis.print("Async enriched analysis:");

// Simulate external API calls
async function fetchRegionalBonus(region: string): Promise<number> {
  await new Promise((resolve) => setTimeout(resolve, 1));
  const bonuses = { "North": 1.2, "South": 1.1, "East": 1.15, "West": 1.05 };
  return bonuses[region as keyof typeof bonuses] || 1.0;
}

async function validateDataQuality(score: number): Promise<boolean> {
  await new Promise((resolve) => setTimeout(resolve, 1));
  return score >= 0 && score <= 100;
}

// Sales data
const salesData = createDataFrame([
  { id: 1, region: "North", product: "Widget A", sales: 1000, quarter: "Q1" },
  { id: 2, region: "South", product: "Widget B", sales: 800, quarter: "Q1" },
  { id: 3, region: "East", product: "Widget A", sales: 1200, quarter: "Q1" },
  { id: 4, region: "West", product: "Widget B", sales: 900, quarter: "Q1" },
]);

// Complete async workflow
const enrichedAnalysis = await salesData
  .mutate({
    // Sync operations
    product_category: (row) => row.product.includes("A") ? "Category A" : "Category B",
    
    // Async operations
    regional_bonus: async (row) => await fetchRegionalBonus(row.region),
    is_valid_sale: async (row) => await validateDataQuality(row.sales),
  })
  .filter(async (row) => await row.is_valid_sale)
  .mutate({
    adjusted_sales: (row) => row.sales * row.regional_bonus,
    performance_tier: (row) => {
      if (row.adjusted_sales >= 1200) return "High";
      if (row.adjusted_sales >= 900) return "Medium";
      return "Low";
    },
  })
  .groupBy("region")
  .summarise({
    total_sales: (group) => s.sum(group.sales),
    total_adjusted: (group) => s.sum(group.adjusted_sales),
    avg_bonus: (group) => s.round(s.mean(group.regional_bonus), 3),
    high_performance_count: (group) => 
      group.filter((row) => row.performance_tier === "High").nrows(),
  })
  .arrange("total_adjusted", "desc");

enrichedAnalysis.print("Async enriched analysis:");

Data Quality and Cleaning Pipeline

Comprehensive data cleaning and quality assessment

This workflow demonstrates comprehensive data quality assessment and cleaning, including missing data analysis and intelligent imputation strategies.

// Messy real-world data
const messyData = createDataFrame([
  { id: 1, name: "Alice", age: 25, score: 85, active: true, notes: "Great" },
  { id: 2, name: null, age: 30, score: null, active: true, notes: "Missing score" },
  { id: 3, name: "Charlie", age: null, score: 92, active: false, notes: null },
  { id: 4, name: "Diana", age: 28, score: NaN, active: null, notes: "NaN score" },
  { id: 5, name: "Eve", age: 35, score: 78, active: true, notes: undefined },
  { id: 6, name: "", age: 0, score: 88, active: false, notes: "Empty string" },
]);

// Data quality assessment
const qualityAnalysis = messyData
  .mutate({
    // Missing data indicators
    name_missing: (row) => row.name === null || row.name === undefined || row.name === "" ? 1 : 0,
    age_missing: (row) => row.age === null || row.age === undefined ? 1 : 0,
    score_missing: (row) => row.score === null || row.score === undefined || isNaN(row.score) ? 1 : 0,
    active_missing: (row) => row.active === null || row.active === undefined ? 1 : 0,
    notes_missing: (row) => row.notes === null || row.notes === undefined ? 1 : 0,
  })
  .summarise({
    total_rows: (df) => df.nrows(),
    name_missing: (df) => s.sum(df.name_missing),
    age_missing: (df) => s.sum(df.age_missing),
    score_missing: (df) => s.sum(df.score_missing),
    active_missing: (df) => s.sum(df.active_missing),
    notes_missing: (df) => s.sum(df.notes_missing),
  })
  .mutate({
    name_missing_pct: (row) => s.round((row.name_missing / row.total_rows) * 100, 1),
    age_missing_pct: (row) => s.round((row.age_missing / row.total_rows) * 100, 1),
    score_missing_pct: (row) => s.round((row.score_missing / row.total_rows) * 100, 1),
    active_missing_pct: (row) => s.round((row.active_missing / row.total_rows) * 100, 1),
    notes_missing_pct: (row) => s.round((row.notes_missing / row.total_rows) * 100, 1),
  });

qualityAnalysis.print("Data quality analysis:");

// Clean the data
const cleanedData = messyData
  .mutate({
    // Smart replacement using statistics
    name_cleaned: (row) => row.name || `Person_${row.id}`,
    age_cleaned: (row) => {
      if (row.age !== null && row.age !== undefined) return row.age;
      return row.active ? 30 : 25; // Different defaults based on other data
    },
    score_cleaned: (row) => {
      if (row.score !== null && row.score !== undefined && !isNaN(row.score)) {
        return row.score;
      }
      // Use median for missing scores
      const validScores = messyData.score.filter(x => x !== null && x !== undefined && !isNaN(x));
      return s.median(validScores, true);
    },
    active_cleaned: (row) => row.active ?? true,
    notes_cleaned: (row) => row.notes || "No additional notes",
  })
  .select("id", "name_cleaned", "age_cleaned", "score_cleaned", "active_cleaned", "notes_cleaned");

cleanedData.print("Cleaned data:");

// Messy real-world data
const messyData = createDataFrame([
  { id: 1, name: "Alice", age: 25, score: 85, active: true, notes: "Great" },
  { id: 2, name: null, age: 30, score: null, active: true, notes: "Missing score" },
  { id: 3, name: "Charlie", age: null, score: 92, active: false, notes: null },
  { id: 4, name: "Diana", age: 28, score: NaN, active: null, notes: "NaN score" },
  { id: 5, name: "Eve", age: 35, score: 78, active: true, notes: undefined },
  { id: 6, name: "", age: 0, score: 88, active: false, notes: "Empty string" },
]);

// Data quality assessment
const qualityAnalysis = messyData
  .mutate({
    // Missing data indicators
    name_missing: (row) => row.name === null || row.name === undefined || row.name === "" ? 1 : 0,
    age_missing: (row) => row.age === null || row.age === undefined ? 1 : 0,
    score_missing: (row) => row.score === null || row.score === undefined || isNaN(row.score) ? 1 : 0,
    active_missing: (row) => row.active === null || row.active === undefined ? 1 : 0,
    notes_missing: (row) => row.notes === null || row.notes === undefined ? 1 : 0,
  })
  .summarise({
    total_rows: (df) => df.nrows(),
    name_missing: (df) => s.sum(df.name_missing),
    age_missing: (df) => s.sum(df.age_missing),
    score_missing: (df) => s.sum(df.score_missing),
    active_missing: (df) => s.sum(df.active_missing),
    notes_missing: (df) => s.sum(df.notes_missing),
  })
  .mutate({
    name_missing_pct: (row) => s.round((row.name_missing / row.total_rows) * 100, 1),
    age_missing_pct: (row) => s.round((row.age_missing / row.total_rows) * 100, 1),
    score_missing_pct: (row) => s.round((row.score_missing / row.total_rows) * 100, 1),
    active_missing_pct: (row) => s.round((row.active_missing / row.total_rows) * 100, 1),
    notes_missing_pct: (row) => s.round((row.notes_missing / row.total_rows) * 100, 1),
  });

qualityAnalysis.print("Data quality analysis:");

// Clean the data
const cleanedData = messyData
  .mutate({
    // Smart replacement using statistics
    name_cleaned: (row) => row.name || `Person_${row.id}`,
    age_cleaned: (row) => {
      if (row.age !== null && row.age !== undefined) return row.age;
      return row.active ? 30 : 25; // Different defaults based on other data
    },
    score_cleaned: (row) => {
      if (row.score !== null && row.score !== undefined && !isNaN(row.score)) {
        return row.score;
      }
      // Use median for missing scores
      const validScores = messyData.score.filter(x => x !== null && x !== undefined && !isNaN(x));
      return s.median(validScores, true);
    },
    active_cleaned: (row) => row.active ?? true,
    notes_cleaned: (row) => row.notes || "No additional notes",
  })
  .select("id", "name_cleaned", "age_cleaned", "score_cleaned", "active_cleaned", "notes_cleaned");

cleanedData.print("Cleaned data:");

Advanced Joining and Reshaping

Complex data integration and reshaping workflows

Multi-Table Analysis

// Employee and department data
const employees = createDataFrame([
  { emp_id: 1, name: "Alice", dept_id: 10, year: 2023, salary: 50000 },
  { emp_id: 2, name: "Bob", dept_id: 20, year: 2023, salary: 60000 },
  { emp_id: 3, name: "Charlie", dept_id: 10, year: 2024, salary: 55000 },
  { emp_id: 4, name: "Diana", dept_id: 30, year: 2023, salary: 70000 },
]);

const departments = createDataFrame([
  { dept_id: 10, year: 2023, dept_name: "Engineering", manager: "John" },
  { dept_id: 20, year: 2023, dept_name: "Marketing", manager: "Jane" },
  { dept_id: 10, year: 2024, dept_name: "Engineering", manager: "Sarah" },
]);

// Multi-key join and analysis
const departmentAnalysis = employees
  .innerJoin(departments, ["dept_id", "year"])
  .mutate({
    salary_rank: (row, _index, df) => s.rank(df.salary, row.salary),
    salary_percentile: (row, _index, df) => s.percentileRank(df.salary, row.salary),
  })
  .groupBy("dept_name")
  .summarise({
    employee_count: (group) => group.nrows(),
    avg_salary: (group) => s.round(s.mean(group.salary), 0),
    median_salary: (group) => s.median(group.salary),
    salary_range: (group) => `${s.min(group.salary)}-${s.max(group.salary)}`,
    top_earner: (group) => {
      const maxSalary = s.max(group.salary);
      return group.find(row => row.salary === maxSalary)?.name || "N/A";
    },
  })
  .arrange("avg_salary", "desc");

departmentAnalysis.print("Department analysis:");

// Employee and department data
const employees = createDataFrame([
  { emp_id: 1, name: "Alice", dept_id: 10, year: 2023, salary: 50000 },
  { emp_id: 2, name: "Bob", dept_id: 20, year: 2023, salary: 60000 },
  { emp_id: 3, name: "Charlie", dept_id: 10, year: 2024, salary: 55000 },
  { emp_id: 4, name: "Diana", dept_id: 30, year: 2023, salary: 70000 },
]);

const departments = createDataFrame([
  { dept_id: 10, year: 2023, dept_name: "Engineering", manager: "John" },
  { dept_id: 20, year: 2023, dept_name: "Marketing", manager: "Jane" },
  { dept_id: 10, year: 2024, dept_name: "Engineering", manager: "Sarah" },
]);

// Multi-key join and analysis
const departmentAnalysis = employees
  .innerJoin(departments, ["dept_id", "year"])
  .mutate({
    salary_rank: (row, _index, df) => s.rank(df.salary, row.salary),
    salary_percentile: (row, _index, df) => s.percentileRank(df.salary, row.salary),
  })
  .groupBy("dept_name")
  .summarise({
    employee_count: (group) => group.nrows(),
    avg_salary: (group) => s.round(s.mean(group.salary), 0),
    median_salary: (group) => s.median(group.salary),
    salary_range: (group) => `${s.min(group.salary)}-${s.max(group.salary)}`,
    top_earner: (group) => {
      const maxSalary = s.max(group.salary);
      return group.find(row => row.salary === maxSalary)?.name || "N/A";
    },
  })
  .arrange("avg_salary", "desc");

departmentAnalysis.print("Department analysis:");

Data Reshaping and Pivoting

// Sales data in long format
const salesLong = createDataFrame([
  { year: 2023, quarter: "Q1", product: "Widget A", sales: 1000 },
  { year: 2023, quarter: "Q1", product: "Widget B", sales: 1500 },
  { year: 2023, quarter: "Q2", product: "Widget A", sales: 1200 },
  { year: 2023, quarter: "Q2", product: "Widget B", sales: 1800 },
  { year: 2024, quarter: "Q1", product: "Widget A", sales: 1100 },
  { year: 2024, quarter: "Q1", product: "Widget B", sales: 1600 },
]);

// Pivot to wide format and analyze
const pivotAnalysis = salesLong
  .pivotWider({
    names_from: "product",
    values_from: "sales",
    expected_columns: ["Widget A", "Widget B"],
  })
  .mutate({
    total_sales: (row) => row["Widget A"] + row["Widget B"],
    widget_a_share: (row) => s.round((row["Widget A"] / row.total_sales) * 100, 1),
    widget_b_share: (row) => s.round((row["Widget B"] / row.total_sales) * 100, 1),
    growth_rate: (row, index, df) => {
      if (index === 0) return 0;
      const prevRow = df[index - 1];
      return s.round(((row.total_sales - prevRow.total_sales) / prevRow.total_sales) * 100, 1);
    },
  })
  .arrange("year", "quarter");

pivotAnalysis.print("Pivot analysis with market share:");

// Sales data in long format
const salesLong = createDataFrame([
  { year: 2023, quarter: "Q1", product: "Widget A", sales: 1000 },
  { year: 2023, quarter: "Q1", product: "Widget B", sales: 1500 },
  { year: 2023, quarter: "Q2", product: "Widget A", sales: 1200 },
  { year: 2023, quarter: "Q2", product: "Widget B", sales: 1800 },
  { year: 2024, quarter: "Q1", product: "Widget A", sales: 1100 },
  { year: 2024, quarter: "Q1", product: "Widget B", sales: 1600 },
]);

// Pivot to wide format and analyze
const pivotAnalysis = salesLong
  .pivotWider({
    names_from: "product",
    values_from: "sales",
    expected_columns: ["Widget A", "Widget B"],
  })
  .mutate({
    total_sales: (row) => row["Widget A"] + row["Widget B"],
    widget_a_share: (row) => s.round((row["Widget A"] / row.total_sales) * 100, 1),
    widget_b_share: (row) => s.round((row["Widget B"] / row.total_sales) * 100, 1),
    growth_rate: (row, index, df) => {
      if (index === 0) return 0;
      const prevRow = df[index - 1];
      return s.round(((row.total_sales - prevRow.total_sales) / prevRow.total_sales) * 100, 1);
    },
  })
  .arrange("year", "quarter");

pivotAnalysis.print("Pivot analysis with market share:");