Descriptive Analysis

Summary statistics, visualization, and publication-ready tables

Summary Statistics

Basic Summary Statistics

* Basic summary
summarize income age education

* Detailed summary with percentiles
summarize income, detail

* Summary by group
bysort treatment: summarize outcome

library(dplyr)

# Basic summary
summary(data[, c("income", "age", "education")])

# Detailed summary
data %>%
  summarize(
    mean = mean(income, na.rm = TRUE),
    sd = sd(income, na.rm = TRUE),
    min = min(income, na.rm = TRUE),
    p25 = quantile(income, 0.25, na.rm = TRUE),
    median = median(income, na.rm = TRUE),
    p75 = quantile(income, 0.75, na.rm = TRUE),
    max = max(income, na.rm = TRUE)
  )

# Summary by group
data %>%
  group_by(treatment) %>%
  summarize(
    mean_outcome = mean(outcome, na.rm = TRUE),
    sd_outcome = sd(outcome, na.rm = TRUE),
    n = n()
  )

import pandas as pd

# Basic summary
data[["income", "age", "education"]].describe()

# Detailed summary with percentiles
data["income"].describe(percentiles=[.25, .5, .75])

# Summary by group
data.groupby("treatment")["outcome"].agg(["mean", "std", "count"])

Tabulations and Cross-tabs

* One-way tabulation
tab education

* Two-way tabulation
tab education treatment

* With row/column percentages
tab education treatment, row col

* Tabulate continuous variable
tabstat income, by(education) stat(mean sd n)

# One-way tabulation
table(data$education)

# Two-way tabulation
table(data$education, data$treatment)

# With proportions
prop.table(table(data$education, data$treatment), margin = 1)  # row %
prop.table(table(data$education, data$treatment), margin = 2)  # col %

# Summarize continuous by group
data %>%
  group_by(education) %>%
  summarize(
    mean_income = mean(income, na.rm = TRUE),
    sd_income = sd(income, na.rm = TRUE),
    n = n()
  )

# One-way tabulation
data["education"].value_counts()

# Two-way tabulation (cross-tab)
pd.crosstab(data["education"], data["treatment"])

# With row/column percentages
pd.crosstab(data["education"], data["treatment"], normalize="index")  # row %
pd.crosstab(data["education"], data["treatment"], normalize="columns")  # col %

# Summarize continuous by group
data.groupby("education")["income"].agg(["mean", "std", "count"])

Balance Tables

A balance table compares characteristics across treatment and control groups. This is essential for any study with treatment/control design.

* Install if needed: ssc install balancetable

* Create balance table
balancetable treatment age female income education ///
    using "balance_table.tex", ///
    ctitles("Control" "Treatment" "Difference") ///
    varlabels replace

* Manual approach with estout
estpost ttest age female income education, by(treatment)
esttab using "balance.tex", ///
    cells("mu_1 mu_2 b se") ///
    label replace

library(modelsummary)

# Create balance table
datasummary_balance(
  ~ treatment,
  data = data,
  fmt = 2,
  dinm_statistic = "p.value"
)

# Export to file
datasummary_balance(
  ~ treatment,
  data = data,
  output = "balance_table.tex"
)

from scipy import stats

# Create balance table manually
vars = ["age", "female", "income", "education"]
balance = []
for var in vars:
    control = data.loc[data["treatment"] == 0, var]
    treated = data.loc[data["treatment"] == 1, var]
    tstat, pval = stats.ttest_ind(control, treated, nan_policy="omit")
    balance.append({
        "Variable": var,
        "Control": control.mean(),
        "Treatment": treated.mean(),
        "Difference": treated.mean() - control.mean(),
        "p-value": pval
    })
balance_df = pd.DataFrame(balance)
print(balance_df.to_string(index=False))

Data Visualization

Graphs Should Stand Alone

Every figure should be understandable without reading the paper. Include clear axis labels, legends, and titles. Use notes to explain any non-obvious details.

Histograms and Density Plots

* Histogram
histogram income, frequency ///
    title("Distribution of Income") ///
    xtitle("Annual Income ($)") ytitle("Frequency")
graph export "hist_income.png", replace

* Kernel density
kdensity income, ///
    title("Income Distribution") ///
    xtitle("Annual Income ($)")

* Overlapping densities by group
twoway (kdensity income if treatment == 0, lcolor(blue)) ///
       (kdensity income if treatment == 1, lcolor(red)), ///
    legend(label(1 "Control") label(2 "Treatment")) ///
    title("Income by Treatment Status")

library(ggplot2)

# Histogram
ggplot(data, aes(x = income)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Income",
       x = "Annual Income ($)", y = "Frequency") +
  theme_minimal()
ggsave("hist_income.png", width = 8, height = 6)

# Kernel density
ggplot(data, aes(x = income)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Income Distribution", x = "Annual Income ($)")

# Overlapping densities by group
ggplot(data, aes(x = income, fill = factor(treatment))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("blue", "red"),
                    labels = c("Control", "Treatment")) +
  labs(title = "Income by Treatment Status", fill = "Group")

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram
plt.figure(figsize=(8, 6))
plt.hist(data["income"].dropna(), bins=30, color="steelblue", edgecolor="white")
plt.title("Distribution of Income")
plt.xlabel("Annual Income ($)")
plt.ylabel("Frequency")
plt.savefig("hist_income.png")

# Kernel density
data["income"].plot(kind="kde", title="Income Distribution")

# Overlapping densities by group
fig, ax = plt.subplots()
for treat, color, label in [(0, "blue", "Control"), (1, "red", "Treatment")]:
    data.loc[data["treatment"] == treat, "income"].plot(
        kind="kde", ax=ax, color=color, label=label
    )
ax.legend()
ax.set_title("Income by Treatment Status")

Scatter Plots

* Basic scatter plot
scatter income education, ///
    title("Income vs. Education") ///
    xtitle("Years of Education") ytitle("Income")

* With fitted line
twoway (scatter income education) ///
       (lfit income education), ///
    legend(off) ///
    title("Income vs. Education")

* Binned scatter plot (binscatter)
* Install: ssc install binscatter
binscatter income education, ///
    title("Income vs. Education") ///
    xtitle("Years of Education") ytitle("Mean Income")

# Basic scatter plot
ggplot(data, aes(x = education, y = income)) +
  geom_point(alpha = 0.5) +
  labs(title = "Income vs. Education",
       x = "Years of Education", y = "Income")

# With fitted line
ggplot(data, aes(x = education, y = income)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Income vs. Education")

# Binned scatter plot
library(binsreg)
binsreg(y = data$income, x = data$education)

# Basic scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(data["education"], data["income"], alpha=0.5)
plt.title("Income vs. Education")
plt.xlabel("Years of Education")
plt.ylabel("Income")

# With fitted line
import numpy as np
z = np.polyfit(data["education"], data["income"], 1)
p = np.poly1d(z)
plt.scatter(data["education"], data["income"], alpha=0.5)
plt.plot(data["education"].sort_values(), p(data["education"].sort_values()), "r-")

# Seaborn regression plot (easier)
sns.regplot(x="education", y="income", data=data)

Time Series and Line Plots

* Collapse to year-level means
preserve
collapse (mean) income, by(year)

* Line plot
twoway line income year, ///
    title("Average Income Over Time") ///
    xtitle("Year") ytitle("Mean Income")
restore

* Multiple lines by group
preserve
collapse (mean) income, by(year treatment)
twoway (line income year if treatment == 0) ///
       (line income year if treatment == 1), ///
    legend(label(1 "Control") label(2 "Treatment"))
restore

# Aggregate to year level
yearly <- data %>%
  group_by(year) %>%
  summarize(mean_income = mean(income, na.rm = TRUE))

# Line plot
ggplot(yearly, aes(x = year, y = mean_income)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(title = "Average Income Over Time",
       x = "Year", y = "Mean Income")

# Multiple lines by group
yearly_by_group <- data %>%
  group_by(year, treatment) %>%
  summarize(mean_income = mean(income, na.rm = TRUE))

ggplot(yearly_by_group, aes(x = year, y = mean_income,
                            color = factor(treatment))) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("blue", "red"),
                     labels = c("Control", "Treatment")) +
  labs(color = "Group")

# Aggregate to year level
yearly = data.groupby("year")["income"].mean()

# Line plot
plt.figure(figsize=(8, 6))
plt.plot(yearly.index, yearly.values, marker="o")
plt.title("Average Income Over Time")
plt.xlabel("Year")
plt.ylabel("Mean Income")

# Multiple lines by group
yearly_by_group = data.groupby(["year", "treatment"])["income"].mean().unstack()
yearly_by_group.plot(marker="o")
plt.legend(["Control", "Treatment"])

Box Plots

* Box plot by group
graph box income, over(education) ///
    title("Income Distribution by Education Level")

# Box plot by group
ggplot(data, aes(x = factor(education), y = income)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  labs(title = "Income Distribution by Education Level",
       x = "Education Level", y = "Income")

# Box plot by group
data.boxplot(column="income", by="education")
plt.title("Income Distribution by Education Level")
plt.suptitle("")  # Remove automatic title
plt.xlabel("Education Level")
plt.ylabel("Income")

# Or with seaborn
sns.boxplot(x="education", y="income", data=data)

Publication Tables

Never Copy Numbers by Hand

Export tables directly from your code to LaTeX or Word. Manual copying introduces errors and makes your work non-reproducible. Use estout/esttab in Stata or modelsummary in R.

Summary Statistics Tables

* Install estout if needed: ssc install estout

* Create summary statistics table
estpost summarize income age education female

* Export to LaTeX
esttab using "summary_stats.tex", ///
    cells("mean(fmt(2)) sd(fmt(2)) min max count") ///
    label noobs replace ///
    title("Summary Statistics")

* Export to Word/RTF
esttab using "summary_stats.rtf", ///
    cells("mean(fmt(2)) sd(fmt(2)) min max count") ///
    label noobs replace

library(modelsummary)

# Summary statistics table
datasummary(
  income + age + education + female ~ Mean + SD + Min + Max + N,
  data = data,
  fmt = 2
)

# Export to LaTeX
datasummary(
  income + age + education + female ~ Mean + SD + Min + Max + N,
  data = data,
  output = "summary_stats.tex"
)

# Export to Word
datasummary(
  income + age + education + female ~ Mean + SD + Min + Max + N,
  data = data,
  output = "summary_stats.docx"
)

# Summary statistics table
summary = data[["income", "age", "education", "female"]].describe().T
summary = summary[["mean", "std", "min", "max", "count"]]
print(summary.round(2))

# Export to LaTeX
print(summary.to_latex(float_format="%.2f"))

# Save to file
summary.to_latex("summary_stats.tex", float_format="%.2f")

# Export to Excel (for Word)
summary.to_excel("summary_stats.xlsx")

Regression Tables

* Run regressions and store results
eststo clear
eststo: reg income treatment
eststo: reg income treatment age female
eststo: reg income treatment age female education

* Create table
esttab using "regression_table.tex", ///
    se star(* 0.10 ** 0.05 *** 0.01) ///
    label r2 ///
    title("Effect of Treatment on Income") ///
    mtitles("(1)" "(2)" "(3)") ///
    addnotes("Standard errors in parentheses") ///
    replace

* Common options:
*   se           - show standard errors (not t-stats)
*   star(...)    - significance stars
*   label        - use variable labels
*   r2           - show R-squared
*   b(3) se(3)   - 3 decimal places
*   drop(...)    - hide certain coefficients

library(modelsummary)
library(fixest)

# Run regressions
m1 <- lm(income ~ treatment, data = data)
m2 <- lm(income ~ treatment + age + female, data = data)
m3 <- lm(income ~ treatment + age + female + education, data = data)

# Create table
modelsummary(
  list("(1)" = m1, "(2)" = m2, "(3)" = m3),
  stars = c('*' = 0.10, '**' = 0.05, '***' = 0.01),
  gof_omit = "AIC|BIC|Log",
  title = "Effect of Treatment on Income",
  notes = "Standard errors in parentheses"
)

# Export to LaTeX
modelsummary(
  list("(1)" = m1, "(2)" = m2, "(3)" = m3),
  stars = c('*' = 0.10, '**' = 0.05, '***' = 0.01),
  output = "regression_table.tex"
)

# Export to Word
modelsummary(
  list("(1)" = m1, "(2)" = m2, "(3)" = m3),
  output = "regression_table.docx"
)

import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

# Run regressions
m1 = smf.ols("income ~ treatment", data=data).fit()
m2 = smf.ols("income ~ treatment + age + female", data=data).fit()
m3 = smf.ols("income ~ treatment + age + female + education", data=data).fit()

# Create table with stargazer
stargazer = Stargazer([m1, m2, m3])
stargazer.title("Effect of Treatment on Income")
print(stargazer.render_latex())

# Or manual approach
from statsmodels.iolib.summary2 import summary_col
print(summary_col([m1, m2, m3], stars=True).as_latex())

Customizing Tables

* Add mean of dependent variable
eststo clear
eststo: reg income treatment age female
    estadd local controls "Yes"
    sum income if e(sample)
    estadd scalar mean_y = r(mean)

* Include in table
esttab using "table.tex", ///
    se star(* 0.10 ** 0.05 *** 0.01) ///
    scalars("mean_y Mean of Dep. Var." "controls Controls") ///
    label replace

* Rename coefficients in output
esttab, rename(treatment "Treatment Effect")

# Add rows for controls and mean of DV
rows <- tribble(
  ~term, ~"(1)", ~"(2)",
  "Controls", "No", "Yes",
  "Mean of DV", sprintf("%.2f", mean(data$income)),
                sprintf("%.2f", mean(data$income))
)

modelsummary(
  list("(1)" = m1, "(2)" = m2),
  add_rows = rows,
  coef_rename = c("treatment" = "Treatment Effect"),
  stars = TRUE
)

# Add custom rows with stargazer
stargazer = Stargazer([m1, m2])
stargazer.add_line("Controls", ["No", "Yes"])
stargazer.add_line("Mean of DV", [f"{data['income'].mean():.2f}"] * 2)
stargazer.rename_covariates({"treatment": "Treatment Effect"})
print(stargazer.render_latex())

Table Checklist

Do all columns have clear headers?
Are standard errors in parentheses (not t-stats)?
Is the dependent variable clearly labeled?
Are significance levels defined in a note?
Is sample size reported for each column?
Are controls indicated (even if not shown)?

← Data Fundamentals Next: Applied Micro →