/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			append.do
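Description: 	Append the old and new leagues, build the team strength and form variables
				(season points, recent form, previous-season points), and save the final dataset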

Notes: 			- Created by Jeff 
				- Last updated 8/24/2022
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Environment -------------------------------------------------------------------------------------
* Start from an empty dataset and select the graph scheme.
* NOTE(review): plotplainblind is not a built-in scheme; presumably it comes from a user-written
* scheme package that must be installed first -- confirm.
clear
set scheme plotplainblind

* Optional machine tuning, left disabled:
*set processors 24
*set max_memory 115g

* Project root. Each collaborator adds their own path to the list below. Every cd is wrapped in
* capture, so a path that does not exist on the current machine is skipped silently; when several
* paths exist, the last one in the list wins.
foreach project_root in "C:/Users/jcross/Dropbox/HFA and VAR" "C:\Users\camel\Dropbox\HFA and VAR" {
	capture cd "`project_root'"
}

* Data folders (relative to the project root): raw inputs, cleaned files, final analysis files
global rawdata "1_data/0_raw"
global cleandata "1_data/1_clean"
global finaldata "1_data/2_final"

* Output folders for figures and regression tables
global graphs "4_results/Figures"
global regressions "4_results/Tables"

**************** Load and stack the cleaned league files *****************
	* Open the additional leagues, then stack the previously used leagues underneath them.
	use "${cleandata}/additional_leagues", clear
	append using "${cleandata}/previous_leagues_new"

	* Keep every season except the one coded 1920.
	* NOTE(review): presumably 1920 is the 2019-20 season -- confirm why it is excluded.
	keep if season != 1920

************************************************************************
*********** Create variables for team strength and form ****************
************************************************************************

	* Match outcome from the home side's point of view. A positive goal_diff is a home win
	* (the original note describes this as the same as T1G>T2G). The missing check matters
	* because Stata ranks missing above every number, so "goal_diff > 0" on its own is true
	* when goal_diff is missing. "goal_diff == 0" is already false for missing values.
	gen win = goal_diff > 0 & !missing(goal_diff)
	gen draw = goal_diff == 0

	* Home side's league points: 3 for a win, 1 for a draw, 0 otherwise.
	gen home_points = 3*win + draw

	* Match totals (actual goals and expected goals).
	gen total_goals = home_score + away_score
	gen total_xgoals = home_xg + away_xg

	* Away side's league points: 3 for an away win, 1 for a draw, 0 otherwise.
	* The away win is read from goal_diff < 0 rather than from home_points == 0, because
	* home_points is also 0 when goal_diff is missing; "goal_diff < 0" is false for missing,
	* so a match with no recorded result no longer hands the away side 3 points.
	gen away_points = 3*(goal_diff < 0) + draw

	* Integer code for each home team (encode numbers the distinct names 1, 2, ...).
	encode home_team, gen(team_list)

	* Placeholders that the per-team loop further down fills in.
	* away_list will hold the away team's code on the same scale as team_list.
	gen away_list = .
	* Season-to-date points before the current match.
	gen past_points_home = .
	gen past_points_away = .
	* Recent-form totals start at 0 (not missing), so rows the loop never touches stay at 0.
	gen recent_points_home_4 = 0
	gen recent_points_away_4 = 0
	gen recent_points_home_8 = 0
	gen recent_points_away_8 = 0
	* Previous-season totals stay missing when no earlier season is found for the team.
	gen prev_season_points_home = .
	gen prev_season_points_away = .
	
	* Summarize to grab max *
	* summarize leaves the largest team code in r(max). The forvalues header below reads r(max)
	* directly, so no other r-class command may be inserted between these two statements.
	sum team_list // do this so I can grab the maximum
	
************* Loop through each team to fill out points by season ***********
	* One pass per team code x. Each pass fills in, for the rows where team x plays:
	*   - away_list (team x's code on rows where it is the away side)
	*   - recent_points_home/away_4 and _8 (points from the team's previous matches this season)
	*   - past_points_home/away (season-to-date points before the current match)
	*   - prev_season_points_home/away (the team's final total from its previous season in the data)
	* It leaves indic_x, points_x and prev_seas_points_x behind for every team; those are dropped
	* after the loop.
	forvalues x =1(1)`r(max)' { // loop through all team numbers
		* temp_name is non-missing only on team x's home rows, so its mode is team x's name;
		* egen writes that single name onto every observation.
		gen temp_name = home_team if team_list == `x' // grab the team name for that #
		egen name = mode(temp_name) // create a variable = that name (for all observations)
		gen indic_`x'=0 
		replace indic_`x' = 1 if name == home_team | name == away_team //indicator = 1 if that team is playing
		
		replace away_list = `x' if name == away_team // make it that # if it is the away team
		
		* Sorts so it is 000000000111111111 and then Seasons and wks (for that team)
		* After this sort, team x's matches sit together at the bottom in season/wk order, so on a
		* team row [_n-k] is the team's k-th previous match, provided k does not reach back past
		* the team's first row (the rows above that belong to other teams and have points_x missing).
		sort indic_`x' season wk
		
		* Points for that team in that week *
		* 0 on every row where team x plays, then overwritten with the points of the side it was on.
		gen points_`x' = 0 if indic_`x' == 1
		replace points_`x' = home_points if team_list == `x'
		replace points_`x' = away_points if away_list == `x'
		
		* Unclear what to do across seasons. For now it will be the last 4 or 8 the team has played this season (did not want to go across seasons since that doesn't make sense for "form")
		* The indicator check is just that this is still that team. Since it is sorted, just have to check that the farthest back obs (e.g., 4 or 8) is the same team
		* General case: the full 4- or 8-match window, applied only when the row 4 (or 8) back is
		* also team x and in the same season. Rows that match no rule keep their initial value of 0.
		replace recent_points_home_4 = points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' & indic_`x'[_n-4]==1 & season == season[_n-4]
		replace recent_points_away_4 = points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' & indic_`x'[_n-4]==1 & season == season[_n-4]
		replace recent_points_home_8 = points_`x'[_n-8]+points_`x'[_n-7]+points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' & indic_`x'[_n-8]==1 & season == season[_n-8]
		replace recent_points_away_8 = points_`x'[_n-8]+points_`x'[_n-7]+points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' & indic_`x'[_n-8]==1 & season == season[_n-8]
		
		* Now dealing with these other cases (say week 4 then there have only been 3 games before)
		* Week 8 there are 7 games before it	
		* Conditioning on wk is enough since if it is wk 8 then we know there are 7 games before it for that team
		* Could have made a loop (using forvalues)
		* NOTE(review): the week-specific rules below overwrite whatever the general case set and do
		* not re-check indic or season on the lagged rows. They are only correct if wk equals the
		* team's match number within the season (no postponed or missing rounds). If a lag reaches a
		* row of another team, points_x is missing there and the sum becomes missing; if it reaches
		* the previous season, those points are counted. Confirm against the data.

		* Week 8
		replace recent_points_home_8 = points_`x'[_n-7]+points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' &  wk == 8
		replace recent_points_away_8 = points_`x'[_n-7]+points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' & wk == 8		
		
		* Week 7
		replace recent_points_home_8 = points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' & wk == 7
		replace recent_points_away_8 = points_`x'[_n-6]+points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' &  wk == 7		

		* Week 6
		replace recent_points_home_8 = points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' &  wk == 6
		replace recent_points_away_8 = points_`x'[_n-5] ///
									+ points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' &  wk == 6	
									
		* Week 5 (in this case it is equal to the recent points back 4 games)
		replace recent_points_home_8 = points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' &  wk == 5
		replace recent_points_away_8 = points_`x'[_n-4]+points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' &  wk == 5		
		
		* Week 4
		* From week 4 down, the 4-match and 8-match windows cover the same matches, so both
		* variables receive the same sum.
		replace recent_points_home_4 = points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' & wk == 4	
		replace recent_points_away_4 = points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' & wk == 4	
		
		replace recent_points_home_8 = points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' &  wk == 4
		replace recent_points_away_8 = points_`x'[_n-3]+points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' &  wk == 4		
		
		* Week 3
		replace recent_points_home_4 = points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' & wk == 3	
		replace recent_points_away_4 = points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' & wk == 3	
		
		replace recent_points_home_8 = points_`x'[_n-2]+points_`x'[_n-1] if team_list == `x' &  wk == 3
		replace recent_points_away_8 = points_`x'[_n-2]+points_`x'[_n-1] if away_list == `x' &  wk == 3	
		
		* Week 2
		replace recent_points_home_4 = points_`x'[_n-1] if team_list == `x' & wk == 2	
		replace recent_points_away_4 = points_`x'[_n-1] if away_list == `x' & wk == 2	
		
		replace recent_points_home_8 = points_`x'[_n-1] if team_list == `x' &  wk == 2
		replace recent_points_away_8 = points_`x'[_n-1] if away_list == `x' &  wk == 2	
		
			
		* This tracks a team's points through the season 
		* Checked it for team "1" which is Alaves and it matched the end-season points along with a few intervals I looked at
		* replace works through the observations in order, so the lagged value is already the
		* running total; from here on points_x is cumulative within season, not per-match. This
		* must therefore stay after the recent-form rules above, which need per-match points.
		replace points_`x' = points_`x'+points_`x'[_n-1] if indic_`x' == 1 & indic_`x'[_n-1] == 1 & season == season[_n-1]
		
		* Separate variable that is the cumulative points minus that result
		* i.e. the season-to-date total going into the match.
		replace past_points_home = points_`x' - home_points if team_list == `x'
		replace past_points_away = points_`x' - away_points if away_list == `x'
		
		* On the team's first row of a season, the previous row holds the closing cumulative total
		* of the team's previous season in the data.
		* NOTE(review): the condition only requires a different season on the previous row, not the
		* immediately preceding one; a team absent for a season would pick up an older total -- confirm
		* this is intended.
		gen temp_prev_season_points = points_`x'[_n-1] if season!=season[_n-1] & indic_`x' == 1 & indic_`x'[_n-1] == 1 // in hindsight this was not necessary since I could have just grabbed the max
		
		* Spread that single value over every row of the same indicator/season group.
		bys indic_`x' season: egen prev_seas_points_`x' = max(temp_prev_season_points)
		
		replace prev_season_points_home = prev_seas_points_`x' if team_list == `x'
		replace prev_season_points_away = prev_seas_points_`x' if away_list == `x'
		
		* Remove the scratch variables so they can be created again on the next pass.
		drop name temp_name temp_prev_season_points 
	}
	
	******** Checked all of this by hand for the first season in Australia (13/14) *******

************** Points Difference Variables **********************

	* Home-minus-away gaps in season-to-date points and in recent form (last 4 / last 8 matches).
	gen diff_points = past_points_home - past_points_away
	gen recent_points_diff_4 = recent_points_home_4 - recent_points_away_4
	gen recent_points_diff_8 = recent_points_home_8 - recent_points_away_8

	* tab diff_points if wk == 1 // should be 0 (it is)

	* better_home = 1 when the home side has strictly more season-to-date points, 0 otherwise.
	* The missing check is required: Stata ranks missing above every number, so a bare
	* "diff_points > 0" is true when diff_points is missing (which happens whenever either
	* past_points variable is missing) and would flag those matches as better_home = 1.
	gen better_home = diff_points > 0 & !missing(diff_points)

	* Previous-season points are missing for teams with no earlier season in the data.
	* In a country's first season they are left missing (the analysis would drop that season).
	* In later seasons the gap is filled with the lowest previous-season total among that
	* season's home teams in the same country, i.e. promoted teams get the lowest total of the
	* teams that "stayed up".
	* Note that a few EPL games for the 2010-2011 season were checked against the previous season points.
	* NOTE(review): min_season and lower_bound are not dropped and remain in the saved dataset.
	bys country: egen min_season = min(season)
	bys country season: egen lower_bound = min(prev_season_points_home)
	foreach side in home away {
		replace prev_season_points_`side' = lower_bound if missing(prev_season_points_`side') & season != min_season
	}

	gen diff_prev_season_points = prev_season_points_home - prev_season_points_away

	* Drop the per-team intermediates created by the loop (one of each per team code).
	drop points_* prev_seas_points_* indic_*

************************************************************************
*********** End of variables for team strength and form ****************
************************************************************************

	* Write the final analysis dataset, overwriting any existing copy.
	save "${finaldata}/final", replace
	