/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			kolmo_smirnov_by_coutry.do
Description: 	Kolmogorov–Smirnov test broken down by country

Notes: 			- Created by Camilo 
				- Last updated 11/19/2023
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Environment setup -------------------------------------------------------------------------------
* Empty the data in memory, turn off output paging, and run under Stata 16.0 version control.
clear
set more off
version 16.0

* Graph scheme. plotplainblind comes from the blindschemes package; if it is missing, run:
* ssc install blindschemes
set scheme plotplainblind

* Optional resource settings, left disabled
*set processors 24
*set max_memory 115g

* Project root: every collaborator's Dropbox location is tried in order. capture hides the
* error for folders that do not exist on this machine. To add your own path, append it below.
foreach project_root in ///
	"C:/Users/jcross/Dropbox/HFA and VAR" ///
	"D:/Dropbox/HFA and VAR" ///
	"C:\Users\uhrig_R\Dropbox\HFA and VAR" ///
	"C:\Users\richa\Dropbox\HFA and VAR" ///
	"C:\Users\camel\Dropbox\HFA and VAR" {
	capture cd "`project_root'"
}

* Data folders, relative to the project root
global rawdata "1_data/0_raw"
global cleandata "1_data/1_clean"
global finaldata "1_data/2_final"

* Output folders, relative to the project root
global graphs "4_results/Figures/histograms_by_country"
global regressions "4_results/Tables"

* Load the final treatment data and draw the pooled histograms -----------------------------------
	use "${finaldata}/final_treatment", clear

	* Panel label for by-graphs: "VAR" where VAR equals 1, "No VAR" for every other observation
	generate figure_label = cond(VAR == 1, "VAR", "No VAR")

	* Histogram of goal_diff with one panel per label; bins are 1 wide and start at -10.5
	histogram goal_diff, by(figure_label, note("")) bcolor(blue) width(1) start(-10.5) fraction

	*graph export "${graphs}/goal_diff_hist.png", replace

	* Group means of goal_diff, drawn as vertical reference lines in the overlay below
	summarize goal_diff if VAR == 0
	local mean_novar = `r(mean)'
	summarize goal_diff if VAR == 1
	local mean_var = `r(mean)'

	* Overlay of the two semi-transparent group histograms
	twoway ///
		(histogram goal_diff if VAR == 0, xline(`mean_novar' `mean_var') bcolor(blue%50) width(1) start(-10.5) fraction) ///
		(histogram goal_diff if VAR == 1, bcolor(red%50) width(1) start(-10.5) fraction ///
			legend(label(1 "No VAR") label(2 "VAR")))
	
		
	
	* Countries to plot. The list is hard-coded: per the original note, countries without any
	* VAR == 1 observations were removed by hand (check with: tab VAR country).
	local selected_countries "australia brazil bundesliga dutch france la_liga portugal serie_a turkey"

	* For each country: draw the by-panel histogram, then build and export an overlay of the
	* No VAR and VAR histograms with each group's mean of goal_diff marked by a vertical line.
	foreach country in `selected_countries' {
		local filename "${graphs}/goal_diff_hist_`country'.png"

		* Both VAR groups are included so that by(figure_label) draws a "No VAR" and a "VAR"
		* panel, as the pooled graph does. This call used to be restricted to VAR == 0, which
		* left only the "No VAR" panel. The graph is displayed but not exported.
		hist goal_diff if country == "`country'", by(figure_label, note("")) bcolor(blue) w(1) start(-10.5) frac

		* Group means for this country
		sum goal_diff if VAR == 0 & country == "`country'"
		local no = `r(mean)'

		sum goal_diff if VAR == 1 & country == "`country'"
		local yes = `r(mean)'

		* Overlay of the two groups; this is the graph that gets exported
		twoway (hist goal_diff if VAR == 0 & country == "`country'", xline(`no' `yes') bcolor(blue%50) w(1) start(-10.5) frac) ///
		(hist goal_diff if VAR == 1 & country == "`country'", bcolor(red%50) w(1) start(-10.5) frac ///
		legend(label(1 "No VAR") label(2 "VAR")))

		graph export "`filename'", replace
	}
	
	
	* Kolmogorov-Smirnov test of goal_diff by VAR, run separately for each country in
	* selected_countries, with the results saved to an Excel workbook.
	*
	* Previously the workbook was set up but nothing was written to it, and the data were
	* subset with keep and then reloaded from disk on every iteration. Now the test uses an
	* if qualifier (the data in memory are left untouched) and the r() results are collected
	* in a matrix that is written once at the end.
	local excel_file "${graphs}/all_countries_KS_test.xlsx"
	putexcel set "`excel_file'", replace

	* One row per country; columns follow the r() names returned by ksmirnov
	local n_countries : word count `selected_countries'
	matrix ks_by_country = J(`n_countries', 6, .)
	matrix colnames ks_by_country = D_1 p_1 D_2 p_2 D_combined p_combined
	matrix rownames ks_by_country = `selected_countries'

	local row = 0
	foreach country in `selected_countries' {
		local ++row

		ksmirnov goal_diff if country == "`country'", by(VAR) exact

		* Store the results before any other command can replace r().
		* The exact p-value is only shown in the Results window; it is not saved here.
		matrix ks_by_country[`row', 1] = r(D_1)
		matrix ks_by_country[`row', 2] = r(p_1)
		matrix ks_by_country[`row', 3] = r(D_2)
		matrix ks_by_country[`row', 4] = r(p_2)
		matrix ks_by_country[`row', 5] = r(D)
		matrix ks_by_country[`row', 6] = r(p)
	}

	* Show the collected results in the log and write them, with row and column names,
	* starting at cell A1
	matrix list ks_by_country
	putexcel A1 = matrix(ks_by_country), names

	
	
	
	* Capture the results in scalars
	*scalar D0 = r(D_1)
	*scalar p_value0 = r(p_1)
	*scalar D1 = r(D_2)
	*scalar p_value1 = r(p_2)
	*scalar D_combined = r(D)
	*scalar p_value_combined = r(p)
	
	* Save scalars to an Excel file
	*putexcel set "${graphs}/aberte.xlsx", replace

	* Write scalar values to Excel cells
	*putexcel A1=("Variable 1") B1=("Variable 2") C1=("Variable 3") D1=("Variable 4")
	*putexcel A2=D0 B2=p_value0 C2=D1 D2=p_value1

	* Close the Excel file
	*putexcel close
	
	*************************** Kolmogorov - Smirnov test, equality of distributions ************
	
	
	* ksmirnov goal_diff, by(VAR) exact
	
	* Running the previous line on the full sample is slow (about 8 minutes). Its output was:
*Two-sample Kolmogorov–Smirnov test for equality of distribution functions
*Smaller group             D     p-value      Exact
* -------------------------------------------------
* 0                    0.0063       0.756
* 1                   -0.0056       0.797
* Combined K-S         0.0063       0.999          .

* With a combined p-value of 0.999, we fail to reject the null hypothesis that goal_diff has the same distribution with and without VAR.

* Maybe valuable: 
* kdensity goal_diff if VAR == 1, plot(kdensity goal_diff if VAR == 0) legend(label(1 "VAR") label(2 "No VAR") rows(1)) 


	
