Advanced

Loops, functions, large datasets, and best practices

Loops and Iteration

Basic Loops

* Loop over numbers
forvalues i = 1/10 {
    di "Iteration `i'"
}

* Loop over values in a list
foreach var in income age education {
    summarize `var'
}

* Loop over variables matching a pattern
foreach var of varlist inc* {
    replace `var' = . if `var' < 0
}

* Loop over files
local files: dir "data/" files "*.csv"
foreach f of local files {
    di "Processing `f'"
    import delimited "data/`f'", clear
    * ... do something ...
}

# Loop over numbers
for (i in 1:10) {
  print(paste("Iteration", i))
}

# Loop over values in a vector
for (var in c("income", "age", "education")) {
  print(summary(data[[var]]))
}

# Loop over files (read_csv() is from readr; note pattern is a regex, not a glob)
library(readr)
files <- list.files("data/", pattern = "\\.csv$", full.names = TRUE)
for (f in files) {
  print(paste("Processing", f))
  df <- read_csv(f)
  # ... do something ...
}

# Better: use lapply/map for functional approach
library(purrr)
all_data <- map_dfr(files, read_csv)

Running Multiple Regressions

* Run same regression for different outcomes
eststo clear
foreach outcome in income hours wages {
    eststo `outcome': reg `outcome' treatment age female, robust
}
esttab, se star(* 0.1 ** 0.05 *** 0.01)

* Run same regression for different subgroups
eststo clear
levelsof state, local(states)
foreach s of local states {
    eststo `s': reg income treatment if state == "`s'", robust
}
esttab

library(fixest)
library(purrr)

# Run same regression for different outcomes
outcomes <- c("income", "hours", "wages")
models <- map(outcomes, ~ feols(
  as.formula(paste(.x, "~ treatment + age + female")),
  data = data,
  vcov = "hetero"
))
names(models) <- outcomes
etable(models)

# Run same regression for different subgroups
states <- unique(data$state)
models <- map(states, ~ feols(
  income ~ treatment,
  data = filter(data, state == .x),
  vcov = "hetero"
))
names(models) <- states
etable(models)

Storing Results

* Create empty dataset to store results (without losing the analysis data)
preserve
clear
set obs 50              // one row per state
gen state = ""
gen beta = .
gen se = .
gen n = .
save "results.dta", replace
restore

* Loop and store: the analysis data stays in memory, and preserve/restore
* protects it while we edit results.dta
levelsof state, local(states)
local row = 1
foreach s of local states {
    reg income treatment if state == "`s'", robust

    preserve
    use "results.dta", clear
    replace state = "`s'" in `row'
    replace beta = _b[treatment] in `row'
    replace se = _se[treatment] in `row'
    replace n = e(N) in `row'
    save "results.dta", replace
    restore

    local row = `row' + 1
}
* (Stata's -postfile- command is a cleaner way to stream results to disk)

# Store results in a data frame
results <- map_dfr(unique(data$state), function(s) {
  model <- lm(income ~ treatment, data = filter(data, state == s))

  tibble(
    state = s,
    beta = coef(model)["treatment"],
    se = sqrt(vcov(model)["treatment", "treatment"]),
    n = nobs(model)
  )
})

# View results
print(results)

# Or use broom for tidy model output
# (dplyr's do() is superseded; group_modify() is the current idiom)
library(broom)

results <- data %>%
  group_by(state) %>%
  group_modify(~ tidy(lm(income ~ treatment, data = .x))) %>%
  ungroup()

Functions and Programs

Don't Repeat Yourself (DRY)

If you're copy-pasting code and changing small things, write a function instead. This reduces errors and makes your code easier to maintain.
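
For instance, rather than pasting the same regression three times, loop over the outcome names. A minimal R sketch, assuming a data frame data with income, hours, wages, and treatment columns:

# Repetitive: three near-identical lines, three chances for typos
# summary(lm(income ~ treatment, data = data))
# summary(lm(hours ~ treatment, data = data))
# summary(lm(wages ~ treatment, data = data))

# DRY: one loop covers every outcome
for (y in c("income", "hours", "wages")) {
  print(summary(lm(reformulate("treatment", y), data = data)))
}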

Writing Functions

* Define a program (Stata's version of functions)
capture program drop my_regression
program define my_regression
    syntax varlist, outcome(string) [controls(string)]

    * Parse variable list
    local treatment: word 1 of `varlist'

    * Run regression
    if "`controls'" != "" {
        reg `outcome' `treatment' `controls', robust
    }
    else {
        reg `outcome' `treatment', robust
    }

    * Store and display results
    di "Treatment effect: " _b[`treatment']
end

* Use the program
my_regression treatment, outcome(income) controls(age female)

# Define a function
run_regression <- function(data, outcome, treatment, controls = NULL) {

  # Build formula
  if (is.null(controls)) {
    formula <- as.formula(paste(outcome, "~", treatment))
  } else {
    formula <- as.formula(paste(outcome, "~", treatment, "+",
                                paste(controls, collapse = " + ")))
  }

  # Run regression
  model <- lm(formula, data = data)

  # Return tidy results
  broom::tidy(model, conf.int = TRUE)
}

# Use the function
run_regression(data, "income", "treatment", c("age", "female"))

# Apply to multiple outcomes
# (set_names() so the .id column records outcome names, not row numbers)
outcomes <- c("income", "hours", "wages")
results <- map_dfr(set_names(outcomes),
                   ~ run_regression(data, .x, "treatment"),
                   .id = "outcome")

Modular Code Structure

* master.do - runs everything
clear all
set more off

* Set paths
global root "/Users/me/Dropbox/project"
global build "$root/build"
global analysis "$root/analysis"

* Run scripts in order
do "$build/code/01_import.do"
do "$build/code/02_clean.do"
do "$analysis/code/01_summary_stats.do"
do "$analysis/code/02_regressions.do"

di "Master script complete!"

# master.R - runs everything
rm(list = ls())

# Set paths
root <- "/Users/me/Dropbox/project"
build <- file.path(root, "build")
analysis <- file.path(root, "analysis")

# Load packages
library(tidyverse)
library(haven)
library(fixest)

# Run scripts in order
source(file.path(build, "code", "01_import.R"))
source(file.path(build, "code", "02_clean.R"))
source(file.path(analysis, "code", "01_summary_stats.R"))
source(file.path(analysis, "code", "02_regressions.R"))

cat("Master script complete!\n")

Working with Large Datasets

Memory Management

* Check current memory usage
memory

* Increase memory limit (Stata/MP)
set max_memory 16g

* Compress data to use less memory
compress

* Keep only needed variables
keep id year treatment outcome

* Drop unnecessary observations early
drop if year < 2000

# Check object sizes
object.size(data)
pryr::mem_used()

# Remove objects you don't need
rm(temp_data, old_results)
gc()  # Force garbage collection

# Select only needed columns early
data <- data %>% select(id, year, treatment, outcome)

# Filter rows early
data <- data %>% filter(year >= 2000)

Efficient Data Formats

* Stata's native .dta format is usually fine
* For very large data, consider:

* 1. Use compress to shrink variable storage
compress

* 2. Convert strings to numeric where possible
encode state_name, gen(state_num)
drop state_name

* 3. Save in chunks if needed
preserve
keep if year <= 2010
save "data_pre2010.dta", replace
restore
keep if year > 2010
save "data_post2010.dta", replace

# Use efficient file formats

# Parquet: columnar format, very fast, good compression
library(arrow)
write_parquet(data, "data.parquet")
data <- read_parquet("data.parquet")

# fst: extremely fast for R-to-R
library(fst)
write_fst(data, "data.fst")
data <- read_fst("data.fst")

# data.table: faster than dplyr for large data
library(data.table)
dt <- fread("data.csv")  # Much faster than read_csv
dt[year >= 2000, .(mean_income = mean(income)), by = state]

Processing in Chunks

* Process data year by year
forvalues y = 2000/2020 {
    use "big_data.dta" if year == `y', clear

    * Process this year's data
    collapse (mean) income, by(state)
    gen year = `y'

    * Append to results
    if `y' == 2000 {
        save "results.dta", replace
    }
    else {
        append using "results.dta"
        save "results.dta", replace
    }
}

# Read and process in chunks
library(readr)

# Define processing function
process_chunk <- function(chunk, pos) {
  chunk %>%
    group_by(state, year) %>%
    summarize(mean_income = mean(income), .groups = "drop")
}

# Read in chunks of 100,000 rows
results <- read_csv_chunked(
  "big_data.csv",
  DataFrameCallback$new(process_chunk),
  chunk_size = 100000
)

# Or use arrow for out-of-memory data
library(arrow)
ds <- open_dataset("data_folder/")  # Can be larger than RAM
ds %>%
  filter(year >= 2000) %>%
  group_by(state) %>%
  summarize(mean_income = mean(income)) %>%
  collect()  # Only materializes the result

Speed Tips

  • Filter early: Remove unneeded rows/columns before processing
  • Use indexes: In Stata, xtset or sort before merging
  • Avoid loops when possible: Vectorized operations are much faster (see the sketch after this list)
  • Use specialized packages: reghdfe in Stata, fixest in R are optimized for large data
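
To illustrate the last two points, a minimal R sketch (df, income, state, and year are hypothetical stand-ins):

# Slow: updating one row per loop iteration
df$log_income <- NA_real_
for (i in seq_len(nrow(df))) {
  df$log_income[i] <- log(df$income[i])
}

# Fast: one vectorized call over the whole column
df$log_income <- log(df$income)

# Specialized packages: fixest absorbs high-dimensional fixed effects
# far more efficiently than creating dummies in lm()
library(fixest)
feols(income ~ treatment | state + year, data = df)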

Best Practices

Code Style

/*==============================================================================
    analysis.do
    Purpose: Main regression analysis
    Author: Your Name
    Date: 2024-01-15
==============================================================================*/

* ============================================
* Setup
* ============================================
clear all
set more off

global root "/Users/me/Dropbox/project"
global data "$root/build/output"
global output "$root/analysis/output"

* ============================================
* Load Data
* ============================================
use "$data/analysis_sample.dta", clear

* ============================================
* Main Regressions
* ============================================

* Baseline specification
reg income treatment, robust

* Add controls
reg income treatment age female education, robust

* ============================================
* Export Results
* ============================================
esttab using "$output/tables/main_results.tex", replace

#===============================================================================
#   analysis.R
#   Purpose: Main regression analysis
#   Author: Your Name
#   Date: 2024-01-15
#===============================================================================

# ============================================
# Setup
# ============================================
rm(list = ls())

library(tidyverse)
library(haven)        # read_dta()
library(fixest)
library(modelsummary)

root <- "/Users/me/Dropbox/project"
data_dir <- file.path(root, "build", "output")
output_dir <- file.path(root, "analysis", "output")

# ============================================
# Load Data
# ============================================
data <- read_dta(file.path(data_dir, "analysis_sample.dta"))

# ============================================
# Main Regressions
# ============================================

# Baseline specification
m1 <- feols(income ~ treatment, data = data, vcov = "hetero")

# Add controls
m2 <- feols(income ~ treatment + age + female + education,
            data = data, vcov = "hetero")

# ============================================
# Export Results
# ============================================
modelsummary(list(m1, m2),
             output = file.path(output_dir, "tables", "main_results.tex"))

Debugging

* Print diagnostic information
di "Number of observations: " _N
tab treatment, missing
summarize income if treatment == 1

* Use assert to check assumptions
assert income > 0 if !missing(income)  // missing counts as greater than any number in Stata
assert !missing(treatment)

* Trace through a program
set trace on
my_program
set trace off

* Check for duplicates
duplicates report id year
assert r(unique_value) == r(N)

# Print diagnostic information
cat("Number of observations:", nrow(data), "\n")
table(data$treatment, useNA = "ifany")
summary(filter(data, treatment == 1)$income)

# Use stopifnot to check assumptions
stopifnot(all(data$income > 0, na.rm = TRUE))
stopifnot(!any(is.na(data$treatment)))

# Debug interactively
browser()  # Insert in code to pause execution
debug(my_function)  # Step through function

# Check for duplicates
n_dups <- data %>%
  group_by(id, year) %>%
  filter(n() > 1) %>%
  nrow()
stopifnot(n_dups == 0)

Version Control Basics

# Initialize git in your project folder
cd ~/Dropbox/my_project
git init

# Create .gitignore to exclude data files
echo "*.dta" >> .gitignore
echo "*.csv" >> .gitignore
echo "*.log" >> .gitignore

# Stage and commit your code
git add *.do *.R
git commit -m "Initial commit of analysis code"

# View history
git log --oneline

# See what changed
git diff

Reproducibility Checklist

  • Can someone else run your code from start to finish without errors?
  • Are all file paths relative (not absolute like C:/Users/John/...)?
  • Are random seeds set for any stochastic processes? (see the snippet below)
  • Is your code version controlled?
  • Can you reproduce your exact results a year from now?
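
On the seed point: set the seed once near the top of any script with a stochastic step. A minimal R sketch (the sampling line is just a hypothetical example; Stata's equivalent is set seed):

# Fix the random number generator so results replicate run-to-run
set.seed(20240115)  # any fixed integer; commit it with the code
train_ids <- sample(nrow(data), size = 1000)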