Real-world examples demonstrating T’s capabilities for data analysis and manipulation.
-- Load customer data
customers = read_csv("customers.csv", clean_colnames = true)
-- Inspect structure
nrow(customers) -- 1000
ncol(customers) -- 5
colnames(customers) -- ["name", "age", "city", "purchases", "active"]
-- View sample
head(customers.age, 10)
-- Summary statistics
mean_age = mean(customers.age, na_rm = true)
sd_age = sd(customers.age, na_rm = true)
median_purchases = quantile(customers.purchases, 0.5, na_rm = true)
print("Average age: " + string(mean_age))
print("Age SD: " + string(sd_age))
print("Median purchases: " + string(median_purchases))
-- Distribution analysis
q25 = quantile(customers.purchases, 0.25, na_rm = true)
q75 = quantile(customers.purchases, 0.75, na_rm = true)
iqr = q75 - q25
print("IQR: " + string(iqr))
-- Active customers only
active = customers |> filter(\(row) row.active == true)
-- High-value customers
high_value = customers
|> filter(\(row) row.purchases > 100)
|> select("name", "purchases")
-- Age groups
young = customers |> filter(\(row) row.age < 30)
mature = customers |> filter(\(row) row.age >= 30 and row.age < 50)
senior = customers |> filter(\(row) row.age >= 50)
print("Young customers: " + string(nrow(young)))
print("Mature customers: " + string(nrow(mature)))
print("Senior customers: " + string(nrow(senior)))
-- Check for NA
sales = read_csv("sales.csv")
-- Count NA per column
na_count_revenue = length(filter(sales.revenue, \(x) is_na(x)))
na_count_cost = length(filter(sales.cost, \(x) is_na(x)))
print("NA in revenue: " + string(na_count_revenue))
print("NA in cost: " + string(na_count_cost))
-- Remove rows with any NA
clean = sales |> filter(\(row)
not is_na(row.revenue) and not is_na(row.cost)
)
-- Or use na_rm in calculations
total_revenue = sum(sales.revenue, na_rm = true)
avg_revenue = mean(sales.revenue, na_rm = true)
-- Messy column names
raw = read_csv("messy_data.csv")
colnames(raw)
-- ["Product ID", "Sales (€)", "Growth%", "2023 Total"]
-- Clean names
clean = read_csv("messy_data.csv", clean_colnames = true)
colnames(clean)
-- ["product_id", "sales_euro", "growth_percent", "x_2023_total"]
-- Now accessible with dot notation
avg_sales = mean(clean.sales_euro, na_rm = true)
-- Column types are inferred from the CSV, but you can derive correctly typed columns yourself
-- (a string-to-number conversion function such as parse_float is assumed to exist)
-- Create derived columns with the correct types
data = data |> mutate("price_num", \(row) parse_float(row.price_str))
-- Boolean flags
data = data |> mutate("is_premium", \(row) row.price > 100)
-- Categorical to numeric
data = data |> mutate("region_code", \(row)
if (row.region == "North") 1
else if (row.region == "South") 2
else if (row.region == "East") 3
else 4
)
-- Calculate bounds
values = df.salary
q25 = quantile(values, 0.25, na_rm = true)
q75 = quantile(values, 0.75, na_rm = true)
iqr = q75 - q25
lower_bound = q25 - 1.5 * iqr
upper_bound = q75 + 1.5 * iqr
-- Remove outliers
cleaned = df |> filter(\(row)
row.salary >= lower_bound and row.salary <= upper_bound
)
print("Original rows: " + string(nrow(df)))
print("After outlier removal: " + string(nrow(cleaned)))
-- Load data
economics = read_csv("economics.csv")
-- Compute correlation
cor_gdp_unemployment = cor(
economics.gdp,
economics.unemployment_rate,
na_rm = true
)
print("GDP vs Unemployment: r = " + string(cor_gdp_unemployment))
-- Multiple correlations
cor_consumption_income = cor(economics.consumption, economics.income, na_rm = true)
cor_savings_income = cor(economics.savings, economics.income, na_rm = true)
print("Consumption vs Income: r = " + string(cor_consumption_income))
print("Savings vs Income: r = " + string(cor_savings_income))
-- Simple regression
advertising = read_csv("advertising.csv")
model = lm(
data = advertising,
formula = sales ~ ad_spend
)
-- Inspect model
print("Slope: " + string(model.slope))
print("Intercept: " + string(model.intercept))
print("R²: " + string(model.r_squared))
print("N: " + string(model.n))
-- Manual prediction
predict = \(x) model.intercept + model.slope * x
predicted_sales = predict(5000)
print("Predicted sales for $5000 ad spend: " + string(predicted_sales))
-- Compare two groups
group_a = df |> filter(\(row) row.treatment == "A") |> \(d) d.outcome
group_b = df |> filter(\(row) row.treatment == "B") |> \(d) d.outcome
-- Calculate means and SDs
mean_a = mean(group_a, na_rm = true)
mean_b = mean(group_b, na_rm = true)
sd_a = sd(group_a, na_rm = true)
sd_b = sd(group_b, na_rm = true)
-- Effect size (Cohen's d)
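-- (simple pooled SD: averages the two variances, which assumes roughly equal group sizes)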
pooled_sd = sqrt((sd_a * sd_a + sd_b * sd_b) / 2)
cohens_d = (mean_a - mean_b) / pooled_sd
print("Group A mean: " + string(mean_a))
print("Group B mean: " + string(mean_b))
print("Cohen's d: " + string(cohens_d))
sales = read_csv("sales_data.csv")
-- Group by region, count rows
by_region = sales
|> group_by("region")
|> summarize("count", \(g) nrow(g))
print(by_region)
-- Multiple aggregations
by_product = sales
|> group_by("product")
|> summarize("total_revenue", \(g) sum(g.revenue, na_rm = true))
|> summarize("count", \(g) nrow(g))
by_product = by_product
|> mutate("avg_price", \(row) row.total_revenue / row.count)
employees = read_csv("employees.csv")
-- Average salary by department
dept_stats = employees
|> group_by("department")
|> summarize("avg_salary", \(g) mean(g.salary, na_rm = true))
-- Multiple groups
regional_dept_stats = employees
|> group_by("region", "department")
|> summarize("avg_salary", \(g) mean(g.salary, na_rm = true))
|> arrange("avg_salary", "desc")
print(regional_dept_stats)
-- Add department size to each row
employees_with_size = employees
|> group_by("department")
|> mutate("dept_size", \(g) nrow(g))
-- Add group mean to each row
employees_with_avg = employees
|> group_by("department")
|> mutate("dept_avg_salary", \(g) mean(g.salary, na_rm = true))
-- Calculate z-score within group
employees_normalized = employees_with_avg
|> group_by("department")
|> mutate("salary_z", \(g)
(g.salary - mean(g.salary, na_rm = true)) / sd(g.salary, na_rm = true)
)
-- Load sales data
sales = read_csv("sales.csv")
-- Add row numbers
sales_ranked = sales
|> mutate("row_num", \(row) row_number(sales.revenue))
-- Rank by revenue
sales_ranked = sales_ranked
|> mutate("revenue_rank", \(row) min_rank(sales.revenue))
|> arrange("revenue_rank")
-- Dense rank (no gaps)
sales_ranked = sales_ranked
|> mutate("dense_revenue_rank", \(row) dense_rank(sales.revenue))
-- Percentiles
sales_ranked = sales_ranked
|> mutate("percentile", \(row) percent_rank(sales.revenue))
print(sales_ranked)
-- Time series data
ts = read_csv("time_series.csv")
-- Previous value
ts_with_lag = ts
|> mutate("prev_value", \(row) lag(ts.value))
-- Next value
ts_with_lead = ts
|> mutate("next_value", \(row) lead(ts.value))
-- Calculate change
ts_with_change = ts_with_lag
|> mutate("change", \(row) row.value - row.prev_value)
-- Multi-period lag
ts_with_lag2 = ts
|> mutate("value_2_periods_ago", \(row) lag(ts.value, 2))
-- Daily sales
daily = read_csv("daily_sales.csv")
-- Cumulative sum
daily_cumsum = daily
|> mutate("cumulative_sales", \(row) cumsum(daily.sales))
-- Running average
daily_cumavg = daily
|> mutate("running_avg", \(row) cummean(daily.sales))
-- Cumulative max
daily_cummax = daily
|> mutate("peak_so_far", \(row) cummax(daily.sales))
-- Cumulative min
daily_cummin = daily
|> mutate("lowest_so_far", \(row) cummin(daily.sales))
print(daily_cumsum)
intent {
description: "Customer lifetime value analysis",
assumes: "Data from 2023-01-01 to 2023-12-31",
requires: "All monetary values in USD"
}
analysis = pipeline {
-- Load raw data
raw = read_csv("transactions.csv", clean_colnames = true)
-- Clean data
cleaned = raw
|> filter(\(row) not is_na(row.amount) and row.amount > 0)
|> filter(\(row) row.date >= "2023-01-01" and row.date <= "2023-12-31")
-- Customer aggregations
customer_stats = cleaned
|> group_by("customer_id")
|> summarize("total_spend", \(g) sum(g.amount))
|> summarize("purchase_count", \(g) nrow(g))
|> mutate("avg_order_value", \(row) row.total_spend / row.purchase_count)
-- Segment customers
segments = customer_stats
|> mutate("segment", \(row)
if (row.total_spend > 1000) "high_value"
else if (row.total_spend > 500) "medium_value"
else "low_value"
)
-- Summary by segment
segment_summary = segments
|> group_by("segment")
|> summarize("customer_count", \(g) nrow(g))
|> summarize("avg_ltv", \(g) mean(g.total_spend))
|> arrange("avg_ltv", "desc")
}
-- Access results
print(analysis.segment_summary)
write_csv(analysis.segments, "customer_segments.csv")
etl = pipeline {
-- Load multiple sources
sales = read_csv("sales.csv")
customers = read_csv("customers.csv")
products = read_csv("products.csv")
-- Clean each source
sales_clean = sales |> filter(\(row) row.amount > 0)
-- Aggregate sales by customer
customer_sales = sales_clean
|> group_by("customer_id")
|> summarize("total_sales", \(g) sum(g.amount))
-- Combine with customer data
-- (a join function is assumed in the sketch below; otherwise match rows manually with filter and map)
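-- Hypothetical sketch: this assumes T offers a left_join verb and that
-- customers.csv includes a customer_id column; neither is confirmed above,
-- so treat these lines as illustration rather than the actual API
enriched = customer_sales
|> left_join(customers, by = "customer_id")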
-- Final report
report = customer_sales
|> arrange("total_sales", "desc")
}
write_csv(etl.report, "sales_report.csv")
-- Check for empty data
df = read_csv("data.csv")
safe_mean = if (nrow(df) == 0)
error("ValueError", "Empty dataset")
else
mean(df.value, na_rm = true)
-- Handle errors in pipeline
result = safe_mean ?|> \(x)
if (is_error(x)) {
print("Error: " + error_message(x))
0.0 -- Default value
} else {
x
}
validate = pipeline {
raw = read_csv("input.csv")
-- Validation checks
check_rows = if (nrow(raw) == 0)
error("ValidationError", "No data found")
else
raw
check_columns = if (ncol(check_rows) < 3)
error("ValidationError", "Insufficient columns")
else
check_rows
check_values = check_columns |> filter(\(row)
not is_na(row.value) and row.value > 0
)
-- If validation passes, proceed
processed = if (nrow(check_values) > 100)
check_values
else
error("ValidationError", "Too few valid rows")
}
-- Use maybe-pipe for error recovery
final = validate.processed ?|> \(x)
if (is_error(x)) {
print("Validation failed: " + error_message(x))
-- Return empty placeholder or default
null
} else {
x
}
-- Try to load data; fall back if it fails
data = read_csv("primary.csv") ?|> \(x)
if (is_error(x)) {
print("Primary source failed, trying backup...")
read_csv("backup.csv")
} else {
x
}
-- Chain multiple fallbacks
robust_load = read_csv("source1.csv")
?|> \(x) if (is_error(x)) read_csv("source2.csv") else x
?|> \(x) if (is_error(x)) read_csv("source3.csv") else x
?|> \(x) if (is_error(x)) error("AllSourcesFailed", "No data available") else x
intent {
description: "Identify revenue drivers for Q4 2023",
goal: "Determine which product categories drive revenue growth",
assumes: "Complete sales data for Q4 2023",
constraints: "Exclude returns and refunds",
success_criteria: "R² > 0.7 for predictive model",
stakeholder: "Product team"
}
-- Analysis follows intent
q4_sales = read_csv("q4_2023_sales.csv")
analysis = pipeline {
clean = q4_sales
|> filter(\(row) row.type != "return" and row.type != "refund")
by_category = clean
|> group_by("category")
|> summarize("total_revenue", \(g) sum(g.revenue))
|> arrange("total_revenue", "desc")
top_categories = by_category
|> filter(\(row) row.total_revenue > 10000)
}
print(analysis.top_categories)
intent {
task: "customer_segmentation",
description: "Segment customers by behavior for targeted marketing",
inputs: {
data_source: "customers.csv",
date_range: "2023-01-01 to 2023-12-31",
required_fields: ["customer_id", "purchase_frequency", "total_spend"]
},
methodology: "RFM analysis (Recency, Frequency, Monetary)",
outputs: {
segments: ["champions", "loyal", "at_risk", "churned"],
format: "CSV with segment labels"
},
validation: {
min_customers_per_segment: 50,
total_customers_check: "should match input count"
}
}
-- The LLM generates an implementation that follows the intent structure
-- (the intent block serves as a contract for regeneration)
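-- A minimal sketch of one possible generated implementation, using only the
-- fields named in the intent (customer_id, purchase_frequency, total_spend);
-- the thresholds and the output filename below are illustrative assumptions
customers = read_csv("customers.csv", clean_colnames = true)
segmented = customers
|> mutate("segment", \(row)
if (row.purchase_frequency > 10 and row.total_spend > 1000) "champions"
else if (row.purchase_frequency > 5) "loyal"
else if (row.purchase_frequency > 1) "at_risk"
else "churned"
)
write_csv(segmented, "rfm_segments.csv")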
-- Intent blocks create an audit trail for LLM-assisted work
intent {
version: "1.0",
author: "Data Team",
llm_assistant: "GPT-4",
created: "2024-01-15",
last_modified: "2024-01-20",
change_log: [
{date: "2024-01-15", change: "Initial implementation"},
{date: "2024-01-18", change: "Added NA handling"},
{date: "2024-01-20", change: "Optimized grouping logic"}
],
human_review: true,
human_reviewer: "Alice Smith",
review_date: "2024-01-21"
}
-- Rest of analysis...
-- Complete workflow: load, clean, analyze, export
intent {
description: "Prepare sales data for dashboard",
frequency: "Daily",
outputs: ["summary_stats.csv", "top_products.csv", "regional_breakdown.csv"]
}
dashboard_prep = pipeline {
-- 1. Load data
raw_sales = read_csv("daily_sales.csv", clean_colnames = true)
-- 2. Data quality
clean_sales = raw_sales
|> filter(\(row) not is_na(row.revenue) and row.revenue > 0)
|> filter(\(row) not is_na(row.product_id))
-- 3. Summary statistics
summary = {
total_revenue: sum(clean_sales.revenue),
total_orders: nrow(clean_sales),
avg_order_value: mean(clean_sales.revenue),
median_order: quantile(clean_sales.revenue, 0.5)
}
-- 4. Top products
top_products = clean_sales
|> group_by("product_name")
|> summarize("revenue", \(g) sum(g.revenue))
|> arrange("revenue", "desc")
-- 5. Regional breakdown
by_region = clean_sales
|> group_by("region")
|> summarize("revenue", \(g) sum(g.revenue))
|> summarize("orders", \(g) nrow(g))
|> mutate("avg_order_value", \(row) row.revenue / row.orders)
|> arrange("revenue", "desc")
}
-- Export results
write_csv(dashboard_prep.top_products, "top_products.csv")
write_csv(dashboard_prep.by_region, "regional_breakdown.csv")
print("Dashboard data prepared successfully")
print("Total revenue: " + string(dashboard_prep.summary.total_revenue))
See Also:
- API Reference — Function documentation
- Data Manipulation — More data verb examples
- Pipeline Tutorial — Detailed pipeline guide