/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			import_refxp.do
Description: 	Imports the referee text files, cleans them, and saves the result as a .dta file that is later merged with the actual match data. Based on Jeff's original code to import the leagues.

Notes: 			- Created by Richard 
				- Last updated 9/15/2022
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Set up environment ------------------------------------------------------------------------------
* Start from an empty dataset.
clear

* Graph scheme; captured so that a failure to set it does not stop the script.
cap set scheme plotplainblind

*set processors 24
*set max_memory 115g

* Move to the project root. Add your own path below with cap in front, so that
* the lines written for other users' machines fail silently on yours.
cap cd "C:/Users/jcross/Dropbox/HFA and VAR"
cap cd "C:/Users/uhrig_R/Dropbox/HFA and VAR"

* Raw data (paths are relative to the project root)
global rawdata "1_data/4_refxp"
global cleandata "1_data/1_clean"

* Results
global graphs "4_results/Figures"
global regressions "4_results/Tables"

* Every cd above is captured, so a wrong working directory would otherwise go
* unnoticed until a later command fails. Stop here with a clear message instead.
mata: st_local("rawdata_found", strofreal(direxists(st_global("rawdata"))))
if `rawdata_found' == 0 {
	display as error "raw data folder ${rawdata} not found from `c(pwd)'"
	display as error "add a -cap cd- line with your project path at the top of this do-file"
	exit 601
}


************************ Importing First ************************************
	* Each raw text file is read with every column as a string, tagged with its
	* competition code and season, and stacked into the tempfile `base'.
	* Files that cannot be read are skipped (with a note), since not every
	* competition/season combination is expected to exist.

	* Files named <competition>_<yy>_<yy+1>.txt
	local leagues "ucl"
	* Files named <competition>_<yy>.txt
	local leagues_inv "afc wc confc clib"

	* Empty starting point for the stacked data
	tempfile base
	save `base', emptyok

	foreach l of local leagues {
		forvalues i = 13(1)18 {
			local j = `i'+1
			local src "${rawdata}/`l'_`i'_`j'.txt"

			clear
			cap import delimited "`src'", stringcols(_all)
			local rc = _rc
			if `rc' != 0 {
				* Keep going, but say which file was skipped and why
				display as text "note: skipped `src' (import delimited returned r(`rc'))"
				continue
			}
			if _N == 0 {
				* Nothing to stack from this file
				continue
			}

			gen country = "`l'"
			gen season = "`i'`j'"
			append using `base'
			save `base', replace
		}
	}

	foreach l of local leagues_inv {
		forvalues i = 13(1)19 {
			local src "${rawdata}/`l'_`i'.txt"

			clear
			cap import delimited "`src'", stringcols(_all)
			local rc = _rc
			if `rc' != 0 {
				* Keep going, but say which file was skipped and why
				display as text "note: skipped `src' (import delimited returned r(`rc'))"
				continue
			}
			if _N == 0 {
				* Nothing to stack from this file
				continue
			}

			gen country = "`l'"
			gen season = "`i'"
			append using `base'
			save `base', replace
		}
	}

	use `base', clear

	* Without this check an empty import only surfaces later as an unrelated
	* "variable not found" error in the cleaning section.
	if _N == 0 {
		display as error "no observations were imported from ${rawdata}; check the working directory and file names"
		exit 2000
	}

************************ Cleaning now ************************************
	* Takes the stacked, all-string dataset in memory, converts the date, day
	* and time columns, flags matches that carry a note, labels the variables
	* and saves the result as ${rawdata}/refxp.dta.

	*drop if wk == "" // these are missing observations or playoffs/relegation matches
	*drop if round == "Championship round" | round == "European competition play-offs" | round == "Relegation round"
	drop matchreport
	
	***** Cleaning or creating new variables *****
	
	* First do dates *
	* Convert the YMD date string to a Stata daily date, keeping the name date
	gen date_new = date(date, "YMD")
	format date_new %td
	* date() returns missing for strings it cannot parse; report them rather
	* than letting them disappear silently
	quietly count if missing(date_new) & date != ""
	if r(N) > 0 {
		display as text "note: " r(N) " non-empty date value(s) could not be parsed as YMD"
	}
	drop date
	rename date_new date
	
	*Encode day of the week *
	* encode assigns codes in alphabetical order of the labels, not weekday order
	encode day, gen(day_num)
	drop day
	rename day_num day
	
	* Indicator for if there are notes *
	gen byte abnormal_match = notes != ""
	
	* Time (note that some matches do not have a start time)
	* The text before any "(" is split at ":" into hour and minute, both kept
	* as strings. Private stub names are used, and the second piece of each
	* split is optional, so this does not fail when no value contains "(" or ":".
	split time, p("(") gen(_tclock)
	split _tclock1, p(":") gen(_thm)
	rename _thm1 hour_start
	cap rename _thm2 minute_start
	if _rc != 0 {
		* No value contained a ":", so there is no minute piece
		gen minute_start = ""
	}
	drop time _tclock*
	* Drop any pieces after a second ":"
	cap drop _thm*
	
	* Score *
	*split score, p("–")
	*rename score1 home_score
	*rename score2 away_score
	drop score
	
	*rename home home_team
	*rename away away_team
	
	*cap rename xg home_xg // note this is only there for France in recent years
	*cap rename v9 away_xg
	
	
	* Destring variables *
	*destring hour_start minute_start wk attendance home_score away_score season home_xg away_xg, replace
	
	*gen goal_diff = home_score - away_score //clear HFA (sanity check)	
	*cap gen goal_diff_xg = home_xg - away_xg //clear HFA (sanity check)	
	
	* Making everything look nice (order and label)
	*order country season wk date day goal_diff goal_diff_xg home_team home_score home_xg away_team away_score away_xg referee venue attendance hour_start minute_start abnormal_match notes round
	
	* Labels are captured because several of these variables are only created
	* by steps that are currently commented out above.
	cap la var country "Country of match"
	cap la var season "Season played - single number if inverted calendar"
	cap la var wk "Week of match"
	cap la var date "Date of match"
	cap la var day "Day of Week"
	cap la var goal_diff "Home - away goals"
	cap la var goal_diff_xg "Home - away xGoals"
	cap la var home_team "Home team (string)"
	cap la var home_score "Home team goals scored"
	cap la var home_xg "Home xGoals (only for recent years)"
	cap la var away_team "Away team (string)"
	cap la var away_score "Away team goals scored"
	cap la var away_xg "Away xGoals (only for recent years)"
	cap la var referee "Center ref for match"
	cap la var venue "Venue"
	cap la var attendance "Attendance"
	cap la var hour_start "Hour of match start"
	cap la var minute_start "Minute the match started (time is hour+minute)"
	cap la var abnormal_match "Indicator for if there is a note"
	cap la var notes "Notes associated with match"
	cap la var round "Round of match (regular season)"
	
	* Save space *
	compress
	
	save "${rawdata}/refxp", replace

