# ==============================================================================
# Exercise 4 Final: VAT Gap Analysis by Firm Size
# Module: Data Reshaping and Joins
# ==============================================================================

# Load required libraries
library(tidyverse)
library(here)
library(data.table)

# OBJECTIVE: Identify top 5 retail firms with largest VAT gaps, by size category
# based on taxable income from CIT data

# ==============================================================================
# STEP 1: Load and Prepare VAT Data
# ==============================================================================

# Load the VAT panel; here() builds the path relative to the project root.
panel_vat <- read_csv(here("data", "Intermediate", "panel_vat.csv"))

# Check the column names before aggregating. The code below assumes a long
# layout (one row per declaration) with the columns listed here -- TODO
# confirm against the file. If the file is wide instead (vat_q1, vat_q2, ...),
# reshape it with pivot_longer() before this check.
vat_cols_needed <- c("firm_id", "declaration_date", "vat_inputs", "vat_outputs")
vat_cols_missing <- setdiff(vat_cols_needed, names(panel_vat))
if (length(vat_cols_missing) > 0) {
  stop(
    "panel_vat is missing expected column(s): ",
    paste(vat_cols_missing, collapse = ", "),
    call. = FALSE
  )
}

# Aggregate declarations to one row per firm-year.
# lubridate:: is spelled out because data.table, attached after tidyverse,
# also exports year() and quarter().
vat_annual <- panel_vat %>%
  mutate(
    declaration_date = as.Date(declaration_date),
    year = lubridate::year(declaration_date)
  ) %>%
  group_by(firm_id, year) %>%
  summarize(
    # actual_vat must come first: once vat_inputs / vat_outputs are redefined
    # below, later expressions would see the summarised values.
    # NOTE(review): net VAT declared is taken as outputs minus inputs; if the
    # data has its own VAT-paid column, use that instead -- TODO confirm.
    actual_vat = sum(vat_outputs, na.rm = TRUE) -
      sum(vat_inputs, na.rm = TRUE),
    vat_inputs = sum(vat_inputs, na.rm = TRUE),
    vat_outputs = sum(vat_outputs, na.rm = TRUE),
    # Distinct quarters, so amended or duplicate filings are not double counted
    quarters_filed = n_distinct(lubridate::quarter(declaration_date)),
    .groups = "drop"
  )

# ==============================================================================
# STEP 2: Load and Prepare CIT Data
# ==============================================================================

# Load the CIT panel; here() builds the path relative to the project root.
panel_cit <- read_csv(here("data", "Intermediate", "panel_cit.csv"))

# Aggregate taxable income to one row per firm-year.
# Assumes columns firm_id, declaration_date and taxable_income -- TODO confirm.
# lubridate:: is spelled out because data.table also exports year().
cit_annual <- panel_cit %>%
  mutate(year = lubridate::year(as.Date(declaration_date))) %>%
  group_by(firm_id, year) %>%
  summarize(
    # Keep NA when every value is missing: sum(na.rm = TRUE) would return 0
    # and the firm would later be classified as "Small" instead of "Unknown".
    taxable_income = if (all(is.na(taxable_income))) {
      NA_real_
    } else {
      sum(taxable_income, na.rm = TRUE)
    },
    .groups = "drop"
  )

# ==============================================================================
# STEP 3: Join VAT with CIT Data
# ==============================================================================

# Keep every VAT firm-year; firms with no CIT record get NA taxable_income.
vat_cit <- vat_annual %>%
  left_join(cit_annual, by = c("firm_id", "year"))

# ==============================================================================
# STEP 4: Load and Join Firm Characteristics
# ==============================================================================

# Load firm characteristics; here() builds the path from the project root.
dt_firms <- read_csv(here("data", "Intermediate", "dt_firms.csv"))

# Guard against fan-out: the join below assumes one row per firm in dt_firms.
if (anyDuplicated(dt_firms$firm_id) > 0) {
  stop("dt_firms has more than one row per firm_id", call. = FALSE)
}

# Add industry and other firm characteristics to each firm-year.
vat_with_firms <- vat_cit %>%
  left_join(dt_firms, by = "firm_id")

# ==============================================================================
# STEP 5: Create Firm Size Categories
# ==============================================================================

# Size categories based on annual taxable_income:
#   Small:  < 50,000
#   Medium: 50,000 up to (but not including) 125,000
#   Large:  >= 125,000
# Comparisons on NA yield NA, which case_when() treats as no match, so missing
# incomes fall through to "Unknown".
vat_with_firms <- vat_with_firms %>%
  mutate(
    firm_size = case_when(
      taxable_income < 50000 ~ "Small",
      taxable_income < 125000 ~ "Medium",
      taxable_income >= 125000 ~ "Large",
      is.na(taxable_income) ~ "Unknown",
      TRUE ~ "Unknown"
    )
  )

# ==============================================================================
# STEP 6: Calculate VAT Gap
# ==============================================================================

# Expected VAT assumes a flat 15% rate applied to annual outputs.
vat_rate <- 0.15

# VAT gap = expected_vat - actual_vat; a positive gap means less VAT was
# declared than the flat rate on outputs would imply.
vat_analysis <- vat_with_firms %>%
  mutate(
    expected_vat = vat_rate * vat_outputs,
    vat_gap = expected_vat - actual_vat
  )

# ==============================================================================
# STEP 7: Filter to Retail and Find Top 5 by Size
# ==============================================================================

# Keep retail rows with a computable gap, then take the 5 largest gaps within
# each size category.
# NOTE(review): assumes dt_firms supplies an `industry` column whose retail
# label is "retail" in some letter case -- TODO confirm the exact value.
# NOTE(review): rows are firm-years, so one firm can appear more than once
# within a size category if it has large gaps in several years.
top_gaps <- vat_analysis %>%
  filter(str_to_lower(industry) == "retail", !is.na(vat_gap)) %>%
  group_by(firm_size) %>%
  # with_ties = FALSE caps each category at exactly 5 rows even when gaps tie
  slice_max(order_by = vat_gap, n = 5, with_ties = FALSE) %>%
  ungroup()

# ==============================================================================
# STEP 8: Validate Results
# ==============================================================================

# Results are printed explicitly so they also appear when the script is run
# with source(), where bare expressions are not auto-printed.

# Total firms analysed (distinct firm_id values in the full analysis)
message("Firms analysed: ", n_distinct(vat_analysis$firm_id))

# Retail firms count
# NOTE(review): assumes the retail label is "retail" in some letter case.
retail_rows <- vat_analysis %>%
  filter(str_to_lower(industry) == "retail")
message("Retail firms: ", n_distinct(retail_rows$firm_id))

# Top gaps by size: expect at most 5 rows per category
print(count(top_gaps, firm_size))

# Look at the top gaps, largest first within each size category
print(
  top_gaps %>%
    select(firm_id, year, firm_size, taxable_income,
           expected_vat, actual_vat, vat_gap) %>%
    arrange(firm_size, desc(vat_gap))
)


# ==============================================================================
# STEP 9: Save Results
# ==============================================================================

# Create the output folder if it does not exist yet; write_csv() does not
# create missing directories.
dir.create(here("data", "Final"), recursive = TRUE, showWarnings = FALSE)

# Save the full firm-year analysis
write_csv(vat_analysis, here("data", "Final", "vat_gap_analysis.csv"))

# Save the top retail gaps by size category
write_csv(top_gaps, here("data", "Final", "top_vat_gaps_retail.csv"))

