Descriptive Analysis
Summary statistics, visualization, and publication-ready tables
Summary Statistics
Basic Summary Statistics
* Basic summary
summarize income age education
* Detailed summary with percentiles
summarize income, detail
* Summary by group
bysort treatment: summarize outcome
library(dplyr)
# Basic summary
summary(data[, c("income", "age", "education")])
# Detailed summary
data %>%
summarize(
mean = mean(income, na.rm = TRUE),
sd = sd(income, na.rm = TRUE),
min = min(income, na.rm = TRUE),
p25 = quantile(income, 0.25, na.rm = TRUE),
median = median(income, na.rm = TRUE),
p75 = quantile(income, 0.75, na.rm = TRUE),
max = max(income, na.rm = TRUE)
)
# Summary by group
data %>%
group_by(treatment) %>%
summarize(
mean_outcome = mean(outcome, na.rm = TRUE),
sd_outcome = sd(outcome, na.rm = TRUE),
n = n()
)
import pandas as pd
# Basic summary
data[["income", "age", "education"]].describe()
# Detailed summary with percentiles
data["income"].describe(percentiles=[.25, .5, .75])
# Summary by group
data.groupby("treatment")["outcome"].agg(["mean", "std", "count"])
Tabulations and Cross-tabs
* One-way tabulation
tab education
* Two-way tabulation
tab education treatment
* With row/column percentages
tab education treatment, row col
* Tabulate continuous variable
tabstat income, by(education) stat(mean sd n)
# One-way tabulation
table(data$education)
# Two-way tabulation
table(data$education, data$treatment)
# With proportions
prop.table(table(data$education, data$treatment), margin = 1) # row %
prop.table(table(data$education, data$treatment), margin = 2) # col %
# Summarize continuous by group
data %>%
group_by(education) %>%
summarize(
mean_income = mean(income, na.rm = TRUE),
sd_income = sd(income, na.rm = TRUE),
n = n()
)
# One-way tabulation
data["education"].value_counts()
# Two-way tabulation (cross-tab)
pd.crosstab(data["education"], data["treatment"])
# With row/column percentages
pd.crosstab(data["education"], data["treatment"], normalize="index") # row %
pd.crosstab(data["education"], data["treatment"], normalize="columns") # col %
# Summarize continuous by group
data.groupby("education")["income"].agg(["mean", "std", "count"])
Balance Tables
A balance table compares characteristics across treatment and control groups. This is essential for any study with treatment/control design.
* Install if needed: ssc install balancetable
* Create balance table
balancetable treatment age female income education ///
using "balance_table.tex", ///
ctitles("Control" "Treatment" "Difference") ///
varlabels replace
* Manual approach with estout
estpost ttest age female income education, by(treatment)
esttab using "balance.tex", ///
cells("mu_1 mu_2 b se") ///
label replace
library(modelsummary)
# Create balance table
datasummary_balance(
~ treatment,
data = data,
fmt = 2,
dinm_statistic = "p.value"
)
# Export to file
datasummary_balance(
~ treatment,
data = data,
output = "balance_table.tex"
)
from scipy import stats
# Create balance table manually
vars = ["age", "female", "income", "education"]
balance = []
for var in vars:
control = data.loc[data["treatment"] == 0, var]
treated = data.loc[data["treatment"] == 1, var]
tstat, pval = stats.ttest_ind(control, treated, nan_policy="omit")
balance.append({
"Variable": var,
"Control": control.mean(),
"Treatment": treated.mean(),
"Difference": treated.mean() - control.mean(),
"p-value": pval
})
balance_df = pd.DataFrame(balance)
print(balance_df.to_string(index=False))
Data Visualization
Graphs Should Stand Alone
Every figure should be understandable without reading the paper. Include clear axis labels, legends, and titles. Use notes to explain any non-obvious details.
Histograms and Density Plots
* Histogram
histogram income, frequency ///
title("Distribution of Income") ///
xtitle("Annual Income ($)") ytitle("Frequency")
graph export "hist_income.png", replace
* Kernel density
kdensity income, ///
title("Income Distribution") ///
xtitle("Annual Income ($)")
* Overlapping densities by group
twoway (kdensity income if treatment == 0, lcolor(blue)) ///
(kdensity income if treatment == 1, lcolor(red)), ///
legend(label(1 "Control") label(2 "Treatment")) ///
title("Income by Treatment Status")
library(ggplot2)
# Histogram
ggplot(data, aes(x = income)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Distribution of Income",
x = "Annual Income ($)", y = "Frequency") +
theme_minimal()
ggsave("hist_income.png", width = 8, height = 6)
# Kernel density
ggplot(data, aes(x = income)) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(title = "Income Distribution", x = "Annual Income ($)")
# Overlapping densities by group
ggplot(data, aes(x = income, fill = factor(treatment))) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("blue", "red"),
labels = c("Control", "Treatment")) +
labs(title = "Income by Treatment Status", fill = "Group")
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram
plt.figure(figsize=(8, 6))
plt.hist(data["income"].dropna(), bins=30, color="steelblue", edgecolor="white")
plt.title("Distribution of Income")
plt.xlabel("Annual Income ($)")
plt.ylabel("Frequency")
plt.savefig("hist_income.png")
# Kernel density
data["income"].plot(kind="kde", title="Income Distribution")
# Overlapping densities by group
fig, ax = plt.subplots()
for treat, color, label in [(0, "blue", "Control"), (1, "red", "Treatment")]:
data.loc[data["treatment"] == treat, "income"].plot(
kind="kde", ax=ax, color=color, label=label
)
ax.legend()
ax.set_title("Income by Treatment Status")
Scatter Plots
* Basic scatter plot
scatter income education, ///
title("Income vs. Education") ///
xtitle("Years of Education") ytitle("Income")
* With fitted line
twoway (scatter income education) ///
(lfit income education), ///
legend(off) ///
title("Income vs. Education")
* Binned scatter plot (binscatter)
* Install: ssc install binscatter
binscatter income education, ///
title("Income vs. Education") ///
xtitle("Years of Education") ytitle("Mean Income")
# Basic scatter plot
ggplot(data, aes(x = education, y = income)) +
geom_point(alpha = 0.5) +
labs(title = "Income vs. Education",
x = "Years of Education", y = "Income")
# With fitted line
ggplot(data, aes(x = education, y = income)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Income vs. Education")
# Binned scatter plot
library(binsreg)
binsreg(y = data$income, x = data$education)
# Basic scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(data["education"], data["income"], alpha=0.5)
plt.title("Income vs. Education")
plt.xlabel("Years of Education")
plt.ylabel("Income")
# With fitted line
import numpy as np
z = np.polyfit(data["education"], data["income"], 1)
p = np.poly1d(z)
plt.scatter(data["education"], data["income"], alpha=0.5)
plt.plot(data["education"].sort_values(), p(data["education"].sort_values()), "r-")
# Seaborn regression plot (easier)
sns.regplot(x="education", y="income", data=data)
Time Series and Line Plots
* Collapse to year-level means
preserve
collapse (mean) income, by(year)
* Line plot
twoway line income year, ///
title("Average Income Over Time") ///
xtitle("Year") ytitle("Mean Income")
restore
* Multiple lines by group
preserve
collapse (mean) income, by(year treatment)
twoway (line income year if treatment == 0) ///
(line income year if treatment == 1), ///
legend(label(1 "Control") label(2 "Treatment"))
restore
# Aggregate to year level
yearly <- data %>%
group_by(year) %>%
summarize(mean_income = mean(income, na.rm = TRUE))
# Line plot
ggplot(yearly, aes(x = year, y = mean_income)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(title = "Average Income Over Time",
x = "Year", y = "Mean Income")
# Multiple lines by group
yearly_by_group <- data %>%
group_by(year, treatment) %>%
summarize(mean_income = mean(income, na.rm = TRUE))
ggplot(yearly_by_group, aes(x = year, y = mean_income,
color = factor(treatment))) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_color_manual(values = c("blue", "red"),
labels = c("Control", "Treatment")) +
labs(color = "Group")
# Aggregate to year level
yearly = data.groupby("year")["income"].mean()
# Line plot
plt.figure(figsize=(8, 6))
plt.plot(yearly.index, yearly.values, marker="o")
plt.title("Average Income Over Time")
plt.xlabel("Year")
plt.ylabel("Mean Income")
# Multiple lines by group
yearly_by_group = data.groupby(["year", "treatment"])["income"].mean().unstack()
yearly_by_group.plot(marker="o")
plt.legend(["Control", "Treatment"])
Box Plots
* Box plot by group
graph box income, over(education) ///
title("Income Distribution by Education Level")
# Box plot by group
ggplot(data, aes(x = factor(education), y = income)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
labs(title = "Income Distribution by Education Level",
x = "Education Level", y = "Income")
# Box plot by group
data.boxplot(column="income", by="education")
plt.title("Income Distribution by Education Level")
plt.suptitle("") # Remove automatic title
plt.xlabel("Education Level")
plt.ylabel("Income")
# Or with seaborn
sns.boxplot(x="education", y="income", data=data)
Publication Tables
Never Copy Numbers by Hand
Export tables directly from your code to LaTeX or Word. Manual copying introduces errors and makes your work non-reproducible. Use estout/esttab in Stata or modelsummary in R.
Summary Statistics Tables
* Install estout if needed: ssc install estout
* Create summary statistics table
estpost summarize income age education female
* Export to LaTeX
esttab using "summary_stats.tex", ///
cells("mean(fmt(2)) sd(fmt(2)) min max count") ///
label noobs replace ///
title("Summary Statistics")
* Export to Word/RTF
esttab using "summary_stats.rtf", ///
cells("mean(fmt(2)) sd(fmt(2)) min max count") ///
label noobs replace
library(modelsummary)
# Summary statistics table
datasummary(
income + age + education + female ~ Mean + SD + Min + Max + N,
data = data,
fmt = 2
)
# Export to LaTeX
datasummary(
income + age + education + female ~ Mean + SD + Min + Max + N,
data = data,
output = "summary_stats.tex"
)
# Export to Word
datasummary(
income + age + education + female ~ Mean + SD + Min + Max + N,
data = data,
output = "summary_stats.docx"
)
# Summary statistics table
summary = data[["income", "age", "education", "female"]].describe().T
summary = summary[["mean", "std", "min", "max", "count"]]
print(summary.round(2))
# Export to LaTeX
print(summary.to_latex(float_format="%.2f"))
# Save to file
summary.to_latex("summary_stats.tex", float_format="%.2f")
# Export to Excel (for Word)
summary.to_excel("summary_stats.xlsx")
Regression Tables
* Run regressions and store results
eststo clear
eststo: reg income treatment
eststo: reg income treatment age female
eststo: reg income treatment age female education
* Create table
esttab using "regression_table.tex", ///
se star(* 0.10 ** 0.05 *** 0.01) ///
label r2 ///
title("Effect of Treatment on Income") ///
mtitles("(1)" "(2)" "(3)") ///
addnotes("Standard errors in parentheses") ///
replace
* Common options:
* se - show standard errors (not t-stats)
* star(...) - significance stars
* label - use variable labels
* r2 - show R-squared
* b(3) se(3) - 3 decimal places
* drop(...) - hide certain coefficients
library(modelsummary)
library(fixest)
# Run regressions
m1 <- lm(income ~ treatment, data = data)
m2 <- lm(income ~ treatment + age + female, data = data)
m3 <- lm(income ~ treatment + age + female + education, data = data)
# Create table
modelsummary(
list("(1)" = m1, "(2)" = m2, "(3)" = m3),
stars = c('*' = 0.10, '**' = 0.05, '***' = 0.01),
gof_omit = "AIC|BIC|Log",
title = "Effect of Treatment on Income",
notes = "Standard errors in parentheses"
)
# Export to LaTeX
modelsummary(
list("(1)" = m1, "(2)" = m2, "(3)" = m3),
stars = c('*' = 0.10, '**' = 0.05, '***' = 0.01),
output = "regression_table.tex"
)
# Export to Word
modelsummary(
list("(1)" = m1, "(2)" = m2, "(3)" = m3),
output = "regression_table.docx"
)
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer
# Run regressions
m1 = smf.ols("income ~ treatment", data=data).fit()
m2 = smf.ols("income ~ treatment + age + female", data=data).fit()
m3 = smf.ols("income ~ treatment + age + female + education", data=data).fit()
# Create table with stargazer
stargazer = Stargazer([m1, m2, m3])
stargazer.title("Effect of Treatment on Income")
print(stargazer.render_latex())
# Or manual approach
from statsmodels.iolib.summary2 import summary_col
print(summary_col([m1, m2, m3], stars=True).as_latex())
Customizing Tables
* Add mean of dependent variable
eststo clear
eststo: reg income treatment age female
estadd local controls "Yes"
sum income if e(sample)
estadd scalar mean_y = r(mean)
* Include in table
esttab using "table.tex", ///
se star(* 0.10 ** 0.05 *** 0.01) ///
scalars("mean_y Mean of Dep. Var." "controls Controls") ///
label replace
* Rename coefficients in output
esttab, rename(treatment "Treatment Effect")
# Add rows for controls and mean of DV
rows <- tribble(
~term, ~"(1)", ~"(2)",
"Controls", "No", "Yes",
"Mean of DV", sprintf("%.2f", mean(data$income)),
sprintf("%.2f", mean(data$income))
)
modelsummary(
list("(1)" = m1, "(2)" = m2),
add_rows = rows,
coef_rename = c("treatment" = "Treatment Effect"),
stars = TRUE
)
# Add custom rows with stargazer
stargazer = Stargazer([m1, m2])
stargazer.add_line("Controls", ["No", "Yes"])
stargazer.add_line("Mean of DV", [f"{data['income'].mean():.2f}"] * 2)
stargazer.rename_covariates({"treatment": "Treatment Effect"})
print(stargazer.render_latex())
Table Checklist
- Do all columns have clear headers?
- Are standard errors in parentheses (not t-stats)?
- Is the dependent variable clearly labeled?
- Are significance levels defined in a note?
- Is sample size reported for each column?
- Are controls indicated (even if not shown)?