ECB

Web scraping ECB statements
Author
Andro Asatashvili
This repository contains an R pipeline that systematically scrapes all ECB Monetary Policy Statements and associated press conference Q&A from the official ECB website (covering both pre- and post-2020 formats).
The script:
Crawls the ECB index and yearly snippet pages
Extracts statement and Q&A text using robust HTML parsing
Automatically infers meeting dates from ECB document identifiers
Cleans legacy Unicode and encoding issues common in older ECB releases (1990s–2010s)
Exports each meeting as a standalone, LaTeX-safe PDF
Generates an index CSV linking dates, source URLs, and output files
The pipeline is designed for reproducibility, historical completeness, and downstream text analysis (e.g. monetary policy research, NLP, or archival use).
# ============================================================
# ECB Monetary Policy Statements — FULLY ROBUST BULK SCRAPER + PDFs ONLY
# FINAL, FIXED VERSION (+ LaTeX unicode sanitiser)
# ✔ works pre-2020 and post-2020
# ✔ supports old/new URL patterns
# ✔ robust statement/Q&A split
# ✔ correct dates (fixes invalid 'trim' error)
# ✔ NO .Rmd files left behind
# ✔ fixes LaTeX Unicode combining marks (e.g., U+0336)
# ============================================================

# install.packages(c("xml2","rvest","stringr","dplyr","purrr","readr","rmarkdown","beepr"))
# install.packages("tinytex"); tinytex::install_tinytex()

library(xml2)
library(rvest)
library(stringr)
library(dplyr)
library(purrr)
library(readr)
library(rmarkdown)

# ---------------------------
# PARAMETERS
# ---------------------------
# Index page that is read first; it is also the base against which the
# snippet paths found on it are resolved to absolute URLs.
INDEX_URL     <- "https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/html/index.en.html"
# Restrict the crawl to snippet pages of a single year (matched on
# "/<year>/html/index_include" in the snippet URL); NA disables the filter.
YEAR_FILTER   <- NA_integer_   # e.g. 2019, or NA for all
# Upper bound on the number of statements processed; applied after sorting
# by date in descending order, so the most recent ones are kept.
MAX_DOCS      <- Inf
# Pause, in seconds, before every document request except the first.
CRAWL_DELAY_S <- 1
# Directory that receives the rendered PDFs and the index CSV.
OUT_DIR       <- file.path(getwd(), "ECB_MPS_PDFs")

# Create the output directory (including parents); no warning if it exists.
dir.create(OUT_DIR, showWarnings = FALSE, recursive = TRUE)

# ---------------------------
# HELPERS
# ---------------------------
# Read an HTML document, returning NULL instead of raising on failure.
#
# Args:
#   url: location handed straight to xml2::read_html().
#
# Returns the parsed document, or NULL when reading fails. A failure is
# reported as a warning carrying the underlying error message, so pages that
# get skipped stay visible in the log instead of disappearing silently.
safe_read_html <- function(url) {
  tryCatch(
    xml2::read_html(url),
    error = function(e) {
      warning(
        "Could not read ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Return the <p> nodes of a page, preferring the most specific container.
#
# The CSS selectors are tried from narrowest to broadest and the first one
# that matches anything wins; if none matches, the result of a plain "p"
# query is returned (which may be empty).
get_paragraph_nodes <- function(pg) {
  candidate_selectors <- c("main .section p", "main p", "body p")

  for (css in candidate_selectors) {
    hits <- rvest::html_elements(pg, css)
    if (length(hits) == 0) {
      next
    }
    return(hits)
  }

  rvest::html_elements(pg, "p")
}

# Split the paragraphs of a statement page into statement and Q&A.
#
# Args:
#   p_nodes: paragraph nodes, e.g. the result of rvest::html_elements().
#
# Returns a list of three character scalars:
#   full_text - all paragraphs joined by blank lines
#   statement - the paragraphs before the Q&A
#   qa        - the Q&A paragraphs ("" when there are none)
#
# The split point is the first of these that is found:
#   1. a "* * *" divider paragraph (dropped from both parts);
#   2. "We are now at your disposal for questions." (kept in the statement);
#   3. "We are now ready to take your questions." (kept in the statement);
#   4. the first paragraph that looks like a question, unless it is the very
#      first paragraph (it then opens the Q&A part).
# When nothing matches, the whole text is the statement and qa is "".
split_statement_qa <- function(p_nodes) {

  paras <- p_nodes |>
    rvest::html_text2() |>
    stringr::str_replace_all("\u00a0", " ") |>
    stringr::str_squish()

  n_paras   <- length(paras)
  full_text <- paste(paras, collapse = "\n\n")

  join_paras <- function(keep) paste(paras[keep], collapse = "\n\n")

  # Build the result from "last statement paragraph" / "first Q&A paragraph".
  # Logical masks are used rather than a:b ranges because a:b counts
  # backwards when a > b; with the marker in the first or last paragraph that
  # pulled the marker itself, or a literal "NA", into the output.
  split_at <- function(stmt_last, qa_first) {
    pos <- seq_len(n_paras)
    list(
      full_text = full_text,
      statement = join_paras(pos <= stmt_last),
      qa        = join_paras(pos >= qa_first)
    )
  }

  # Index of the first TRUE, or NA when there is none.
  first_match <- function(hit) which(hit)[1]

  # 1) ECB divider "* * *"
  star_idx <- first_match(stringr::str_detect(paras, "^(\\*\\s*){3}$"))
  if (!is.na(star_idx)) {
    return(split_at(star_idx - 1, star_idx + 1))
  }

  # 2) Pre-2020 wording
  disp_idx <- first_match(stringr::str_detect(
    paras,
    "^We are now at your disposal for questions\\.?$"
  ))
  if (!is.na(disp_idx)) {
    return(split_at(disp_idx, disp_idx + 1))
  }

  # 3) Post-2020 wording
  ready_idx <- first_match(paras == "We are now ready to take your questions.")
  if (!is.na(ready_idx)) {
    return(split_at(ready_idx, ready_idx + 1))
  }

  # 4) Fallback: first clear question
  q_idx <- first_match(stringr::str_detect(
    paras,
    "^(Q(\\.|:)|Question\\b|My first question\\b|I have .* question)"
  ))
  if (!is.na(q_idx) && q_idx > 1) {
    return(split_at(q_idx - 1, q_idx))
  }

  list(full_text = full_text, statement = full_text, qa = "")
}

# Fetch a single statement page and return its statement / Q&A split
# (the list produced by split_statement_qa()). Read errors are not caught
# here and propagate to the caller.
scrape_one_mps <- function(url) {
  url |>
    xml2::read_html() |>
    get_paragraph_nodes() |>
    split_statement_qa()
}

# Build a Date from the URL year and the six-digit document identifier.
#
# Args:
#   year:   four-digit year (taken from the URL path by the caller).
#   yymmdd: six-character identifier; characters 3-4 are read as the month
#           and 5-6 as the day. The leading two characters are not used.
#
# Returns a Date vector of the same length as the inputs. Anything that does
# not form a real calendar date (NA input, truncated identifier, month 13,
# 30 February, ...) yields NA rather than an error: the explicit `format`
# makes as.Date() return NA for unparseable strings instead of stopping with
# "character string is not in a standard unambiguous format".
build_date <- function(year, yymmdd) {
  month <- substr(yymmdd, 3, 4)
  day   <- substr(yymmdd, 5, 6)
  iso   <- paste(year, month, day, sep = "-")
  as.Date(iso, format = "%Y-%m-%d")
}

# ---- Sanitize text before LaTeX/PDF ----
#
# Args:
#   x: character vector (or NULL). Several elements are joined with "\n".
#
# Returns a single string in which characters that are known to break the
# PDF step are replaced by ASCII equivalents or removed. NULL or zero-length
# input gives "".
clean_for_latex <- function(x) {
  if (is.null(x) || length(x) == 0) return("")
  x <- paste0(x, collapse = "\n")

  # Ordered (pattern, replacement) pairs. Order matters: the individual
  # Windows-1252 code points must be mapped before the catch-all that drops
  # whatever remains of the C1 range.
  rules <- list(
    # 1) Normalise spaces
    c("\u00A0", " "),            # NBSP -> space

    # 2) Unicode math & punctuation (pdflatex killers)
    c("\u2212", "-"),            # minus sign
    c("\u2013|\u2014", "-"),     # en/em dash
    c("\u2018|\u2019", "'"),     # curly single quotes
    c("\u201C|\u201D", "\""),    # curly double quotes
    c("\u2026", "..."),          # ellipsis

    # 3) Windows-1252 bytes that were decoded as C1 control characters.
    #    Map the ones that carry meaning; deleting them outright would fuse
    #    the neighbouring text (e.g. a dash between two numbers vanishing).
    c("\u0080", "EUR"),          # euro sign
    c("\u0085", "..."),          # ellipsis
    c("[\u0091\u0092]", "'"),    # single quotes
    c("[\u0093\u0094]", "\""),   # double quotes
    c("[\u0096\u0097]", "-"),    # en/em dash
    c("[\u0080-\u009F]", ""),    # drop the remaining C1 controls

    # 4) Remove combining diacritics (e.g. U+0336)
    c("[\u0300-\u036F]", ""),

    # 5) Remove invisible formatting characters
    c("[\u200B-\u200F\u202A-\u202E\u2066-\u2069]", "")
  )

  for (rule in rules) {
    x <- stringr::str_replace_all(x, rule[[1]], rule[[2]])
  }

  x
}



# ---------------------------
# 1) READ INDEX + LAZYLOAD SNIPPETS
# ---------------------------
# Read the index page; the per-year snippet pages are listed, comma
# separated, in the data-snippets attribute of #lazyload-container.
index_pg <- xml2::read_html(INDEX_URL)

snip_attr <- index_pg |>
  rvest::html_element("#lazyload-container") |>
  rvest::html_attr("data-snippets")

# Without the attribute every later step would quietly run on zero
# documents, so fail here with a message that names the cause.
if (is.na(snip_attr) || !nzchar(snip_attr)) {
  stop(
    "No 'data-snippets' attribute found on #lazyload-container at ",
    INDEX_URL, "; the index page layout may have changed.",
    call. = FALSE
  )
}

snip_paths <- stringr::str_split(snip_attr, ",")[[1]] |>
  stringr::str_trim()

# An empty entry (e.g. from a trailing comma) would resolve to INDEX_URL
# itself and be fetched as if it were a snippet page.
snip_paths <- snip_paths[nzchar(snip_paths)]

snip_urls <- snip_paths |>
  xml2::url_absolute(INDEX_URL) |>
  unique()

# Optional restriction to the snippet page(s) of one year.
if (!is.na(YEAR_FILTER)) {
  snip_urls <- snip_urls[stringr::str_detect(
    snip_urls,
    paste0("/", YEAR_FILTER, "/html/index_include")
  )]
}

# ---------------------------
# 2) EXTRACT STATEMENT URLs (NO xml_find_all DISPATCH ISSUE)
# ---------------------------
# Collect the statement URLs linked from one snippet page.
#
# Args:
#   snip_url: absolute URL of a snippet page.
#
# Returns a character vector of unique absolute URLs whose path matches the
# statement pattern below (identifier "isNNNNNN", optionally prefixed with
# "ecb." and optionally followed by "~<suffix>"). An unreadable snippet page,
# or a failing link query, gives character(0).
extract_urls_from_snippet <- function(snip_url) {
  mps_pattern <- "/press/press_conference/monetary-policy-statement/\\d{4}/html/(ecb\\.)?is\\d{6}(~[^/]+)?\\.en\\.html$"

  snippet <- safe_read_html(snip_url)
  if (is.null(snippet)) {
    return(character(0))
  }

  doc_root <- xml2::xml_root(snippet)

  links <- tryCatch(
    {
      anchors <- xml2::xml_find_all(doc_root, ".//a[@href]")
      xml2::xml_attr(anchors, "href")
    },
    error = function(e) character(0)
  )

  # Discard missing and empty href values before resolving them.
  usable   <- !is.na(links) & nzchar(links)
  resolved <- unique(xml2::url_absolute(links[usable], snip_url))

  purrr::keep(
    resolved,
    function(link) stringr::str_detect(link, mps_pattern)
  )
}

# Gather the statement URLs from every snippet page. as.character() turns
# the NULL that unlist() returns for "no URLs at all" into character(0), so
# the table below still gets its `url` column.
all_mps_urls <- purrr::map(snip_urls, extract_urls_from_snippet) |>
  unlist(use.names = FALSE) |>
  as.character() |>
  unique()

if (length(all_mps_urls) == 0) {
  warning(
    "No monetary policy statement URLs were found; nothing to process.",
    call. = FALSE
  )
}

# ---------------------------
# 3) BUILD URL TABLE WITH SAFE DATE COLUMN
# ---------------------------
mps_tbl <- tibble(url = all_mps_urls) |>
  mutate(
    year   = as.integer(stringr::str_match(
      url, "/monetary-policy-statement/(\\d{4})/")[, 2]),
    yymmdd = stringr::str_match(
      url, "(?:ecb\\.)?is(\\d{6})(?:~|\\.)")[, 2]
  ) |>
  mutate(
    # build_date() is applied row by row and collected as plain day counts
    # (a typed map also works for zero rows); the Date class is restored
    # with an explicit origin, which as.Date() requires for numeric input
    # on R versions before 4.3.
    date = as.Date(
      purrr::map2_dbl(
        year, yymmdd,
        function(y, id) as.numeric(build_date(y, id))
      ),
      origin = "1970-01-01"
    )
  ) |>
  arrange(desc(date))

# Only truncate when a finite limit is set; this avoids handing a
# non-finite `n` to slice_head().
if (is.finite(MAX_DOCS)) {
  mps_tbl <- slice_head(mps_tbl, n = MAX_DOCS)
}

cat("Statements to process:", nrow(mps_tbl), "\n")

# ---------------------------
# 4) LOOP: SCRAPE → PDF ONLY
# ---------------------------
# One result row per successfully rendered statement. Entries of documents
# that had to be skipped stay NULL and are ignored when the rows are bound.
n_docs <- nrow(mps_tbl)
out    <- vector("list", n_docs)

for (i in seq_len(n_docs)) {

  u <- mps_tbl$url[i]
  d <- as.Date(mps_tbl$date[i])  # SAFE
  # File tag: ISO date when known, otherwise a positional placeholder.
  tag <- if (!is.na(d)) format(d, "%Y-%m-%d") else paste0("item_", i)

  cat(sprintf("[%d/%d] %s\n", i, n_docs, u))
  if (i > 1) Sys.sleep(CRAWL_DELAY_S)

  # A single unreachable or malformed page must not abort the whole run
  # (and lose the index of everything rendered so far): report and move on.
  txt <- tryCatch(
    scrape_one_mps(u),
    error = function(e) {
      warning(
        "Skipping ", u, " (download/parse failed): ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    }
  )
  if (is.null(txt)) next

  # Sanitize scraped text so LaTeX doesn't choke.
  txt$statement <- clean_for_latex(txt$statement)
  txt$qa        <- clean_for_latex(txt$qa)

  rmd_tmp <- file.path(tempdir(), paste0("ECB_", tag, ".Rmd"))
  pdf_out <- paste0("ECB_MPS_", tag, ".pdf")

  # NOTE(review): the scraped text is embedded verbatim in an .Rmd file, so
  # Markdown / knitr syntax that happens to occur in it is interpreted
  # rather than printed literally.
  rendered <- tryCatch(
    {
      writeLines(paste0(
        "---\n",
        "title: \"ECB Monetary Policy Statement (with Q&A)\"\n",
        "subtitle: \"", tag, "\"\n",
        "output: pdf_document\n",
        "geometry: margin=1in\n",
        "fontsize: 11pt\n",
        "---\n\n",
        "Source: ", u, "\n\n",
        "## Monetary Policy Statement\n\n",
        txt$statement,
        "\n\n\\newpage\n\n",
        "## Questions and Answers\n\n",
        txt$qa
      ), rmd_tmp)

      rmarkdown::render(
        rmd_tmp,
        output_file = pdf_out,
        output_dir  = OUT_DIR,
        quiet = TRUE
      )
      TRUE
    },
    error = function(e) {
      warning(
        "Skipping ", u, " (PDF rendering failed): ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      FALSE
    },
    # Remove the temporary .Rmd whether or not rendering succeeded.
    finally = unlink(rmd_tmp)
  )
  if (!rendered) next

  out[[i]] <- tibble(
    date = d,
    url  = u,
    pdf  = file.path(OUT_DIR, pdf_out)
  )
}

# ---------------------------
# 5) SAVE INDEX
# ---------------------------
# Combine the per-document rows and write them next to the PDFs.
index_df <- bind_rows(out)
readr::write_csv(index_df, file.path(OUT_DIR, "ECB_MPS_index.csv"))

cat("\nDONE.\nPDFs + index saved in:\n", OUT_DIR, "\n")
print(index_df)

# Completion sound is a convenience only: beepr is neither attached above nor
# required for any output, so a missing package must not turn a finished run
# into an error on its very last line.
if (requireNamespace("beepr", quietly = TRUE)) {
  beepr::beep(2)
}