Advanced
Loops, functions, large datasets, and best practices
Loops and Iteration
Basic Loops
* Loop over numbers
forvalues i = 1/10 {
di "Iteration `i'"
}
* Loop over values in a list
foreach var in income age education {
summarize `var'
}
* Loop over variables matching a pattern
foreach var of varlist inc* {
replace `var' = . if `var' < 0
}
* Loop over files
local files: dir "data/" files "*.csv"
foreach f of local files {
di "Processing `f'"
import delimited "data/`f'", clear
* ... do something ...
}
# Loop over numbers
for (i in 1:10) {
print(paste("Iteration", i))
}
# Loop over values in a vector
for (var in c("income", "age", "education")) {
print(summary(data[[var]]))
}
# Loop over files
library(readr)  # provides read_csv()
files <- list.files("data/", pattern = "\\.csv$", full.names = TRUE)  # pattern is a regex, not a glob
for (f in files) {
print(paste("Processing", f))
df <- read_csv(f)
# ... do something ...
}
# Better: use lapply/map for functional approach
library(purrr)
all_data <- map_dfr(files, read_csv)
Running Multiple Regressions
* Run same regression for different outcomes
eststo clear
foreach outcome in income hours wages {
eststo `outcome': reg `outcome' treatment age female, robust
}
esttab, se star(* 0.1 ** 0.05 *** 0.01)
* Run same regression for different subgroups
eststo clear
levelsof state, local(states)
foreach s of local states {
* Note: eststo names cannot contain spaces; this assumes short state codes
eststo `s': reg income treatment if state == "`s'", robust
}
esttab
library(fixest)
library(purrr)
library(dplyr)  # for filter() in the subgroup loop below
# Run same regression for different outcomes
outcomes <- c("income", "hours", "wages")
models <- map(outcomes, ~ feols(
as.formula(paste(.x, "~ treatment + age + female")),
data = data,
vcov = "hetero"
))
names(models) <- outcomes
etable(models)
# Run same regression for different subgroups
states <- unique(data$state)
models <- map(states, ~ feols(
income ~ treatment,
data = filter(data, state == .x),
vcov = "hetero"
))
names(models) <- states
etable(models)
Storing Results
* Create empty dataset to store results
clear
set obs 50
gen state = ""
gen beta = .
gen se = .
gen n = .
save "results.dta", replace
* Reload the analysis data (saving results.dta left it in memory)
use "analysis_data.dta", clear   // hypothetical filename for your analysis dataset
* Loop and store
levelsof state, local(states)
local row = 1
foreach s of local states {
    reg income treatment if state == "`s'", robust
    * Grab the estimates before switching datasets
    local b = _b[treatment]
    local se_b = _se[treatment]
    local nobs = e(N)
    * preserve/restore keeps the analysis data in memory across iterations
    preserve
    use "results.dta", clear
    replace state = "`s'" in `row'
    replace beta = `b' in `row'
    replace se = `se_b' in `row'
    replace n = `nobs' in `row'
    save "results.dta", replace
    restore
    local row = `row' + 1
}
# Store results in a data frame
# (lm uses classical SEs; swap in fixest or sandwich for robust SEs to match the Stata side)
results <- map_dfr(unique(data$state), function(s) {
model <- lm(income ~ treatment, data = filter(data, state == s))
tibble(
state = s,
beta = coef(model)["treatment"],
se = sqrt(vcov(model)["treatment", "treatment"]),
n = nobs(model)
)
})
# View results
print(results)
# Or use broom for tidy model output
library(broom)
results <- data %>%
  group_by(state) %>%
  group_modify(~ tidy(lm(income ~ treatment, data = .x))) %>%
  ungroup()
Functions and Programs
Don't Repeat Yourself (DRY)
If you're copy-pasting code and changing small things, write a function instead. This reduces errors and makes your code easier to maintain.
Writing Functions
* Define a program (Stata's version of functions)
capture program drop my_regression
program define my_regression
syntax varlist, outcome(string) [controls(string)]
* Parse variable list
local treatment: word 1 of `varlist'
* Run regression
if "`controls'" != "" {
reg `outcome' `treatment' `controls', robust
}
else {
reg `outcome' `treatment', robust
}
* Store and display results
di "Treatment effect: " _b[`treatment']
end
* Use the program
my_regression treatment, outcome(income) controls(age female)
# Define a function
run_regression <- function(data, outcome, treatment, controls = NULL) {
# Build formula
if (is.null(controls)) {
formula <- as.formula(paste(outcome, "~", treatment))
} else {
formula <- as.formula(paste(outcome, "~", treatment, "+",
paste(controls, collapse = " + ")))
}
# Run regression
model <- lm(formula, data = data)
# Return tidy results
broom::tidy(model, conf.int = TRUE)
}
# Use the function
run_regression(data, "income", "treatment", c("age", "female"))
# Apply to multiple outcomes
outcomes <- c("income", "hours", "wages")
results <- map_dfr(set_names(outcomes), ~ run_regression(data, .x, "treatment"),
                   .id = "outcome")
Modular Code Structure
* master.do - runs everything
clear all
set more off
* Set paths
global root "/Users/me/Dropbox/project"
global build "$root/build"
global analysis "$root/analysis"
* Run scripts in order
do "$build/code/01_import.do"
do "$build/code/02_clean.do"
do "$analysis/code/01_summary_stats.do"
do "$analysis/code/02_regressions.do"
di "Master script complete!"
# master.R - runs everything
rm(list = ls())
# Set paths
root <- "/Users/me/Dropbox/project"
build <- file.path(root, "build")
analysis <- file.path(root, "analysis")
# Load packages
library(tidyverse)
library(haven)
library(fixest)
# Run scripts in order
source(file.path(build, "code", "01_import.R"))
source(file.path(build, "code", "02_clean.R"))
source(file.path(analysis, "code", "01_summary_stats.R"))
source(file.path(analysis, "code", "02_regressions.R"))
cat("Master script complete!\n")
Working with Large Datasets
Memory Management
* Check current memory usage
memory
* Increase memory limit (Stata/MP)
set max_memory 16g
* Compress data to use less memory
compress
* Keep only needed variables
keep id year treatment outcome
* Drop unnecessary observations early
drop if year < 2000
# Check object sizes
object.size(data)
pryr::mem_used()
# Remove objects you don't need
rm(temp_data, old_results)
gc() # Force garbage collection
# Select only needed columns early
data <- data %>% select(id, year, treatment, outcome)
# Filter rows early
data <- data %>% filter(year >= 2000)
Efficient Data Formats
* Stata's native .dta format is usually fine
* For very large data, consider:
* 1. Use compress to shrink variable storage
compress
* 2. Convert strings to numeric where possible
encode state_name, gen(state_num)
drop state_name
* 3. Save in chunks if needed
preserve
keep if year <= 2010
save "data_pre2010.dta", replace
restore
keep if year > 2010
save "data_post2010.dta", replace
# Use efficient file formats
# Parquet: columnar format, very fast, good compression
library(arrow)
write_parquet(data, "data.parquet")
data <- read_parquet("data.parquet")
# fst: extremely fast for R-to-R
library(fst)
write_fst(data, "data.fst")
data <- read_fst("data.fst")
# data.table: faster than dplyr for large data
library(data.table)
dt <- fread("data.csv") # Much faster than read_csv
dt[year >= 2000, .(mean_income = mean(income)), by = state]
Processing in Chunks
* Process data year by year
forvalues y = 2000/2020 {
use "big_data.dta" if year == `y', clear
* Process this year's data
collapse (mean) income, by(state)
gen year = `y'
* Append to results
if `y' == 2000 {
save "results.dta", replace
}
else {
append using "results.dta"
save "results.dta", replace
}
}
# Read and process in chunks
library(readr)
# Define processing function
process_chunk <- function(chunk, pos) {
chunk %>%
group_by(state, year) %>%
summarize(mean_income = mean(income), .groups = "drop")
}
# Read in chunks of 100,000 rows
results <- read_csv_chunked(
"big_data.csv",
DataFrameCallback$new(process_chunk),
chunk_size = 100000
)
# Or use arrow for out-of-memory data
library(arrow)
ds <- open_dataset("data_folder/") # Can be larger than RAM
ds %>%
filter(year >= 2000) %>%
group_by(state) %>%
summarize(mean_income = mean(income)) %>%
collect() # Only materializes the result
Speed Tips
- Filter early: Remove unneeded rows and columns before processing
- Use indexes: In Stata, xtset or sort before merging
- Avoid loops when possible: Vectorized operations are much faster (see the sketch below)
- Use specialized packages: reghdfe in Stata and fixest in R are optimized for large data
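To make the loop-versus-vectorization point concrete, here is a minimal R sketch (the vector and its size are made up for illustration); the vectorized call is typically far faster because the work happens in compiled code:
# Hypothetical data: one million incomes
income <- runif(1e6, min = 1e4, max = 1e5)
# Slow: explicit element-by-element loop
system.time({
  log_income <- numeric(length(income))
  for (i in seq_along(income)) {
    log_income[i] <- log(income[i])
  }
})
# Fast: a single vectorized call does the same work
system.time({
  log_income <- log(income)
})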
Best Practices
Code Style
/*==============================================================================
analysis.do
Purpose: Main regression analysis
Author: Your Name
Date: 2024-01-15
==============================================================================*/
* ============================================
* Setup
* ============================================
clear all
set more off
global root "/Users/me/Dropbox/project"
global data "$root/build/output"
global output "$root/analysis/output"
* ============================================
* Load Data
* ============================================
use "$data/analysis_sample.dta", clear
* ============================================
* Main Regressions
* ============================================
* Baseline specification
reg income treatment, robust
* Add controls
reg income treatment age female education, robust
* ============================================
* Export Results
* ============================================
esttab using "$output/tables/main_results.tex", replace
#===============================================================================
# analysis.R
# Purpose: Main regression analysis
# Author: Your Name
# Date: 2024-01-15
#===============================================================================
# ============================================
# Setup
# ============================================
rm(list = ls())
library(tidyverse)
library(haven)        # for read_dta()
library(fixest)
library(modelsummary)
root <- "/Users/me/Dropbox/project"
data_dir <- file.path(root, "build", "output")
output_dir <- file.path(root, "analysis", "output")
# ============================================
# Load Data
# ============================================
data <- read_dta(file.path(data_dir, "analysis_sample.dta"))
# ============================================
# Main Regressions
# ============================================
# Baseline specification
m1 <- feols(income ~ treatment, data = data, vcov = "hetero")
# Add controls
m2 <- feols(income ~ treatment + age + female + education,
data = data, vcov = "hetero")
# ============================================
# Export Results
# ============================================
modelsummary(list(m1, m2),
output = file.path(output_dir, "tables", "main_results.tex"))
Debugging
* Print diagnostic information
di "Number of observations: " _N
tab treatment, missing
summarize income if treatment == 1
* Use assert to check assumptions
assert income > 0
assert !missing(treatment)
* Trace through a program
set trace on
my_program
set trace off
* Check for duplicates
duplicates report id year
assert r(unique_value) == r(N)
# Print diagnostic information
cat("Number of observations:", nrow(data), "\n")
table(data$treatment, useNA = "ifany")
summary(filter(data, treatment == 1)$income)
# Use stopifnot to check assumptions
stopifnot(all(data$income > 0, na.rm = TRUE))
stopifnot(!any(is.na(data$treatment)))
# Debug interactively
browser() # Insert in code to pause execution
debug(my_function) # Step through function
# Check for duplicates
n_dups <- data %>%
  group_by(id, year) %>%
  filter(n() > 1) %>%
  nrow()
stopifnot(n_dups == 0)
Version Control Basics
# Initialize git in your project folder
cd ~/Dropbox/my_project
git init
# Create .gitignore to exclude data files
echo "*.dta" >> .gitignore
echo "*.csv" >> .gitignore
echo "*.log" >> .gitignore
# Stage and commit your code
git add *.do *.R
git commit -m "Initial commit of analysis code"
# View history
git log --oneline
# See what changed
git diff
Reproducibility Checklist
- Can someone else run your code from start to finish without errors?
- Are all file paths relative (not absolute like C:/Users/John/...)?
- Are random seeds set for any stochastic processes? (see the sketch below)
- Is your code version controlled?
- Can you reproduce your exact results a year from now?
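A minimal R sketch of the relative-path and random-seed items (the here package and the CSV file name are illustrative assumptions, not part of the examples above):
library(here)    # resolves paths relative to the project root
library(readr)
# Relative path: works on any machine that has the project folder
# (hypothetical file for illustration)
data <- read_csv(here("build", "output", "analysis_sample.csv"))
# Set a seed before any stochastic step (sampling, bootstrap, simulation)
# so the exact results can be reproduced later
set.seed(20240115)
boot_idx <- sample(nrow(data), replace = TRUE)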