
/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			clean_refxp.do
Description: 	Cleaning refs data from UCL, World Cup, Confederations Cup, Copa Libertadores, and the Asian Cup,
				then merging the data with the dta final_treatment_wrefs and with the stadia (capacity) data,
				and saving the result as a new .dta called: final_treatment_wrefs_experience_n_stadia

Notes: 			- Created by Camilo 
				- Last updated 9/15/2022
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Set up environment ------------------------------------------------------------------------------
clear
set more off
version 16.0

*set scheme plotplainblind

*set processors 24
*set max_memory 115g

* Move to the project root. Each collaborator adds their own path here with cap in
* front, so that the paths which do not exist on the current machine are skipped
* instead of stopping the do-file.
cap cd "C:/Users/jcross/Dropbox/HFA and VAR"
cap cd "C:\Users\uhrig_R\Dropbox\HFA and VAR"
cap cd "C:\Users\richa\Dropbox\HFA and VAR"
cap cd "C:\Users\camel\Dropbox\HFA and VAR"

* Data folders, written relative to the project root.
* Forward slashes are used everywhere because Stata accepts them on every
* operating system, whereas a backslash is only a separator on Windows.
global rawdata "1_data/0_raw"
global cleandata "1_data/1_clean"
global finaldata "1_data/2_final"
global refxpdata "1_data/4_refxp"

* Results
global graphs "4_results/Figures"
global regressions "4_results/Tables"

* Because every cd above is captured, a machine with none of those folders would
* carry on in the wrong directory. Stop here with a clear message in that case.
capture confirm file "${refxpdata}/refxp.dta"
if _rc {
	display as error "Could not find ${refxpdata}/refxp.dta from the current working directory."
	display as error "Add your own project path to the list of cap cd lines at the top of this do-file."
	exit 601
}

* Load the referee data scraped by Richard for the UCL, World Cup,
* Confederations Cup, Copa Libertadores, and the Asian Cup.
use "${refxpdata}/refxp.dta", clear

* Keep only the rows that actually name a referee.
keep if referee != ""

* Build the cleaned name used for matching, one step at a time:
* dots become spaces, repeated inner spaces collapse to one, outer spaces are
* trimmed, and finally the name is put in UPPERCASE.
gen referee_upper = referee
replace referee_upper = subinstr(referee_upper, "."," ",.)
replace referee_upper = stritrim(referee_upper)
replace referee_upper = strtrim(referee_upper)
replace referee_upper = upper(referee_upper)

* Referee-name fixes for the Asian Cup (country == "afc").
* Rather than replacing particular characters, the whole referee name is replaced:
* each line below rewrites one known garbled spelling (mis-encoded accents,
* apostrophes) as a plain ASCII name.
* The replace statements carry no if-condition, so they apply to every row of the
* dataset, not only to afc rows; the tab commands are filtered only for inspection.

*afc:
* names before the fixes
tab referee_upper if country == "afc"

replace referee_upper = subinstr(referee_upper, "CÃ©SAR ARTURO RAMOS" , "CESAR ARTURO RAMOS",.)
replace referee_upper = subinstr(referee_upper, "IL'GIZ TANTASHEV" , "ILGIZ TANTASHEV",.)
replace referee_upper = subinstr(referee_upper, "PETER O'LEARY" , "PETER OLEARY",.)
replace referee_upper = subinstr(referee_upper, "RYÅ«JI SATÅ" , "RYUJI SATO",.)

* names after the fixes
tab referee_upper if country == "afc"


* Referee-name fixes for the Copa Libertadores (country == "clib").
* names before the fixes
tab referee_upper if country == "clib"
* for clib makes more sense to replace some regular strings:
* The chain below swaps mis-encoded character sequences for plain ASCII letters.
* It runs top to bottom on every row (no if-condition), so the ORDER MATTERS:
* once a sequence has been replaced, a later rule for the same sequence finds nothing.
* NOTE(review): as written, the second line maps "Ãº" to "A" and "Ã±" to "", so the
* later rules mapping those same sequences to "U" and "N" look like no-ops. Fixes
* further down in this do-file (for example "RAAL OROZCO", "SAAL LAVERNI",
* "JESAS GIL") match the names this produces, so do not change this chain
* without updating them as well.
* NOTE(review): a bare "Ã" is mapped to "A" before the "Ã³" rule on the same line,
* which appears to leave "A³" behind instead of "O" -- confirm against the data.
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã¡","A",.),"Ã©","E",.),"Ã­","I",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ãº","A",.),"Ã±","",.),"î","I",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"ô","O",.),"ï","I",.),"Ã¡","A",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"î","I",.),"Ã","A",.),"Ã³","O",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã©","E",.),"Ã±","N",.),"Ãº","U",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã­","I",.),"Ö","O",.),"A©","E",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"ı","I",.),"Ş","S",.),"ş","S",.)

* names after the character-level fixes
tab referee_upper if country == "clib"

* Whole-name fixes for names the character rules above left wrong.
* NOTE(review): the ANGEL ARTEAGA and JHON ALVAREZ lines show the same text on both
* sides; they may differ by non-printing characters -- confirm before removing.
replace referee_upper = subinstr(referee_upper, "ANGEL ARTEAGA" , "ANGEL ARTEAGA",.)
replace referee_upper = subinstr(referee_upper, "AMER MACHADO" , "IMER MACHADO",.)
replace referee_upper = subinstr(referee_upper, "ASCAR MALDONADO" , "OSCAR MALDONADO",.)
replace referee_upper = subinstr(referee_upper, "ASCAR ROJAS" , "OSCAR ROJAS",.)
replace referee_upper = subinstr(referee_upper, "JHON ALVAREZ" , "JHON ALVAREZ",.)


* Referee-name fixes for the Confederations Cup (country == "confc").
* The replace statements have no if-condition, so they apply to every row.
* names before the fixes
tab referee_upper if country == "confc"
* character-level fixes: "A¶" becomes "O" and "A§" becomes "Z"
replace referee_upper = subinstr(subinstr(referee_upper,"A¶","O",.),"A§","Z",.)

* whole-name fixes
replace referee_upper = subinstr(referee_upper, "DJAMEL HAA¯MOUDI" , "DJAMEL HAIMOUDI",.)
replace referee_upper = subinstr(referee_upper, "MILORAD MAÅ¾IÄ" , "MILORAD MAZIC",.)

* names after the fixes
tab referee_upper if country == "confc"


* Referee-name fixes for the Champions League (country == "ucl").
* The replace statements have no if-condition, so they apply to every row.
* subinstr matches substrings, so a rule that lengthens a name (for example
* "MATEU LAHOZ" to "ANTONIO MATEU LAHOZ") also hits names that already contain
* the longer form; the clean-up lines near the end of this section undo the doubling.
* names before the fixes
tab referee_upper if country == "ucl"
* character-level fixes
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"A®","I",.),"Å¾","Z",.),"A©","E",.)

* whole-name fixes, one referee per group
replace referee_upper = subinstr(referee_upper, "ALIAKSEI KULBAKOU" , "ALIAKSEI KULBAKOV",.)

replace referee_upper = subinstr(referee_upper, "ARTUR DIAS" , "ARTUR SOARES DIAS",.)

replace referee_upper = subinstr(referee_upper, "BARIS ÅIMÅEK" , "BARIS SIMSEK",.)

* two differently garbled spellings of the same name
replace referee_upper = subinstr(referee_upper, "CA¼NEYT AAKÄ±R" , "CUNEYT ZAKIR",.)
replace referee_upper = subinstr(referee_upper, "CA¼NEYT AAKÎ¹R" , "CUNEYT ZAKIR",.)

replace referee_upper = subinstr(referee_upper, "DANILO GRUJIÄ" , "DANILO GRUJIC",.)

replace referee_upper = subinstr(referee_upper, "FÄ±RAT AYDÄ±NUS" , "FIRAT AYDINUS",.)

* lengthens the name; the doubled first name this can create is removed further down
replace referee_upper = subinstr(referee_upper, "MATEU LAHOZ" , "ANTONIO MATEU LAHOZ",.)

* two differently garbled spellings of the same name
replace referee_upper = subinstr(referee_upper, "OVIDIU HAÅ£EGAN" , "OVIDIU HATEGAN",.)
replace referee_upper = subinstr(referee_upper, "OVIDIU HAÈEGAN" , "OVIDIU HATEGAN",.)

* first name only, so it applies to every referee with this first name
replace referee_upper = subinstr(referee_upper, "PAWEÅ" , "PAWEL",.)

* NOTE(review): this maps one referee name to a different person's name -- confirm it is intended
replace referee_upper = subinstr(referee_upper, "SANDOR ANDA³-SZABA³" , "VIKTOR KASSAI",.)
 
replace referee_upper = subinstr(referee_upper, "SERGEI KARASEV" , "SERGEY KARASEV",.)

*I can't explain it but we actually need these 4 lines -- DO NOT TOUCH 
* NOTE(review): the lines look pairwise identical; the search strings may differ by
* non-printing characters -- leave them exactly as they are.
replace referee_upper = subinstr(referee_upper, "SLAVKO VINÄIÄ" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINÄIÄ" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINCIC" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINCIC" , "SLAVKO VINCIC",.)

replace referee_upper = subinstr(referee_upper, "TOMASZ MUSIAÅ" , "TOMASZ MUSIAL",.)

*FORGOT a couple:
replace referee_upper = subinstr(referee_upper, "MANUEL GRA¤FE" , "MANUEL GRAFE",.) 

* undo the doubled first name created by the MATEU LAHOZ rule above
replace referee_upper = subinstr(referee_upper, "ANTONIO ANTONIO MATEU LAHOZ" , "ANTONIO MATEU LAHOZ",.)

* names after the fixes
tab referee_upper if country == "ucl"

* World Cup names (country == "wc"): listed for inspection only.
tab referee_upper if country == "wc"

*I came back after the merge to replace these and solidify the merge:
* and also to just correct some silly duplicates in case we ever need them:
* These rules rewrite names so that they match the spelling expected by the merge on
* referee_upper later in this do-file. They have no if-condition, so they apply
* to every row, and they run in order.
* subinstr matches substrings, so a rule that lengthens a name also hits names that
* already contain the longer form; the "BORBALAN BORBALAN" and "CARLOS CARLOS"
* lines below clean up doubling of that kind.
* NOTE(review): not every lengthening rule has a matching clean-up line -- confirm
* with tab referee_upper that no doubled names remain.


replace referee_upper = subinstr(referee_upper, "PAULO CESAR DE OLIVEIRA" , "PAULO CESAR OLIVEIRA",.)

* "RAAL", "SAAL" and "JESAS" below are the spellings produced by the earlier
* character replacements in this do-file, not typos in this section.
replace referee_upper = subinstr(referee_upper, "RAAL OROSCO" , "RAUL OROZCO",.)
replace referee_upper = subinstr(referee_upper, "RAAL OROZCO" , "RAUL OROZCO",.)

replace referee_upper = subinstr(referee_upper, "SAAL LAVERNI" , "SAUL LAVERNI",.)

replace referee_upper = subinstr(referee_upper, "WILTON SAMPAIO" , "WILTON PEREIRA SAMPAIO",.)

replace referee_upper = subinstr(referee_upper, "CARLOS DEL CERRO" , "CARLOS DEL CERRO GRANDE",.)

replace referee_upper = subinstr(referee_upper, "CARLOS VELASCO" , "CARLOS VELASCO CARBALLO",.)


* three spellings are all mapped to the same full name
* NOTE(review): "DIEGO FERNANDEZ" is mapped to DAVID FERNANDEZ BORBALAN -- confirm this is the same referee
replace referee_upper = subinstr(referee_upper, "FERNANDEZ BORBALAN" , "DAVID FERNANDEZ BORBALAN",.)
replace referee_upper = subinstr(referee_upper, "DAVID FERNANDEZ" , "DAVID FERNANDEZ BORBALAN",.)
replace referee_upper = subinstr(referee_upper, "DIEGO FERNANDEZ" , "DAVID FERNANDEZ BORBALAN",.)

*AGAIN, not the brightest code, but gets the job done:
* removes the doubled surname created by the rules just above
replace referee_upper = subinstr(referee_upper, "DAVID FERNANDEZ BORBALAN BORBALAN" , "DAVID FERNANDEZ BORBALAN",.)

replace referee_upper = subinstr(referee_upper, "JESAS GIL" , "JESUS GIL MANZANO",.)

replace referee_upper = subinstr(referee_upper, "JORGE DE SOUSA" , "JORGE SOUSA",.)

replace referee_upper = subinstr(referee_upper, "RICARDO DE BURGOS" , "RICARDO DE BURGOS BENGOETXEA",.)

replace referee_upper = subinstr(referee_upper, "UNDIANO MALLENCO" , "ALBERTO UNDIANO",.)

replace referee_upper = subinstr(referee_upper, "VELASCO CARBALLO" , "CARLOS VELASCO CARBALLO",.)

*ONE MORE:
* removes the doubled first name created by the VELASCO CARBALLO rule above
replace referee_upper = subinstr(referee_upper, "CARLOS CARLOS VELASCO CARBALLO" , "CARLOS VELASCO CARBALLO",.)

* Right now the dataset is ready to be merged. The only information we need is in
* which seasons each referee name appears, so we keep just that. The competition and
* the season are stored under new names (country_2, season_2), presumably so they do
* not clash with the country and season variables of the dataset merged in later.

gen country_2 = country

* season is a string in this file; convert it to a number
gen season_2 = real(season)

* we only need a couple of columns:
keep referee_upper country_2 season_2

* We want to avoid repeating referees, for example Felix Brych appears in ucl, wc and confc.
* We decided to keep only the first time they show up in any competition.
* country_2 is included in the sort order as a tie-breaker: when a referee appears in
* several competitions during his first season, sorting on season_2 alone leaves the
* order of those rows undefined, so the competition that is kept could change from
* one run to the next. With the tie-breaker the alphabetically first one is kept.
bysort referee_upper (season_2 country_2): keep if _n == 1
*bys referee_upper: egen first_year = min(season_2)
*collapse (first) season_2, by(referee_upper country_2)

* rename variable season_2, let's call it international_first
rename season_2 international_first

tab referee_upper

label variable country_2 "international tournament"

* Merge the one-row-per-referee table with final_treatment_wrefs on the cleaned
* referee name. The file is addressed through the finaldata global, relative to the
* project root chosen at the top of this do-file, so the merge works on every
* collaborator's machine rather than only where one hard-coded folder exists.
* last_merge records the merge result: 1 = referee found only in the tournament
* data, 2 = found only in final_treatment_wrefs, 3 = found in both.
merge 1:m referee_upper using "${finaldata}/final_treatment_wrefs", force generate(last_merge)

* CHECKING IF THE MERGE WAS SUCCESSFUL:
* country by country!! tab referee_upper if last_merge == 1 & country == "afc"
* Tournament referees without any match in final_treatment_wrefs, by competition:
tab referee_upper country_2 if last_merge == 1

tab referee_upper if last_merge == 1 & country_2 == "clib"


tab referee_upper if last_merge == 1 & country_2 == "confc"

tab referee_upper if last_merge == 1 & country_2 == "ucl"

* I think the match is reliable..

* Drop the rows that have no goal difference; this removes, among others, the
* tournament referees that did not match any row of final_treatment_wrefs
* (presumably goal_diff comes from that file -- confirm).
drop if goal_diff == .


* Dummy for referee experience:
* experienced_ref is 1 when the referee has a first international season on record
* and that season is not later than the season of the match, and 0 otherwise.
*
* The two season variables do not always use the same format, which is solved here:
* a value whose text form has more than two characters is used as it is, and a
* shorter value x is expanded to x*100 + x before the comparison.

gen international_since = cond(strlen(string(international_first)) > 2, international_first, international_first*100 + international_first)

gen season_comparison = cond(strlen(string(season)) > 2, season, season*100 + season)

* the logical expression evaluates to 1 when true and 0 when false
gen experienced_ref = (international_since != . & international_since <= season_comparison)

* br referee_upper season season_comparison international_since international_first experience


* A little more: get data on stadium capacity in order to use it.
* This section builds venue_upper, a cleaned venue name later used as the key for
* the merge with the stadia file. The steps run in order on every row and the
* ORDER MATTERS, because later rules see the output of earlier ones.

tab venue country

* Change the name of the venues to UPPERCASE:
* (dots become spaces, repeated spaces collapse to one, outer spaces are trimmed)
gen venue_upper = venue
replace venue_upper = upper(strtrim(stritrim(subinstr(venue_upper, "."," ",.))))

*replace a bunch of garbage strings:
* These two run before the apostrophes are stripped further down, so the apostrophe
* written in their replacement text is removed again by those later rules.
replace venue_upper = subinstr(venue_upper, "CENTRAL'NYJ STADION KAZAN'" , "CENTRAL'NYJ STADION",.)

replace venue_upper = subinstr(venue_upper, "STADION CENTRAL'NYJ" , "CENTRAL'NYJ STADION",.)

*STADION IMENI V I LENINA



* Character-level fixes for mis-encoded accented letters, using the same sequence
* of rules that this do-file applies to the referee names.
* NOTE(review): as written, "Ãº" is mapped to "A" and "Ã±" to "" on the second line,
* so the later rules mapping them to "U" and "N" look like no-ops; venue fixes
* further down (for example "MUNICIPAL DE IPURAA") match the names this produces.
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"Ã¡","A",.),"Ã©","E",.),"Ã­","I",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"Ãº","A",.),"Ã±","",.),"î","I",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"ô","O",.),"ï","I",.),"Ã¡","A",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"î","I",.),"Ã","A",.),"Ã³","O",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"Ã©","E",.),"Ã±","N",.),"Ãº","U",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"Ã­","I",.),"Ö","O",.),"A©","E",.)
replace venue_upper = subinstr(subinstr(subinstr(venue_upper,"ı","I",.),"Ş","S",.),"ş","S",.)
replace venue_upper = subinstr(subinstr(venue_upper,"A¶","O",.),"A§","Z",.)

* Lower-case accented letters that upper() left untouched, plus the "A³" sequence
* left behind by the rules above.
replace venue_upper = subinstr(subinstr(venue_upper,"á","A",.),"é","E",.)
replace venue_upper = subinstr(subinstr(venue_upper,"ã","A",.),"ê","E",.)
replace venue_upper = subinstr(subinstr(venue_upper,"í","I",.),"ú","U",.)
replace venue_upper = subinstr(subinstr(venue_upper,"ó","O",.),"A³","O",.)

replace venue_upper = subinstr(subinstr(venue_upper,"ƒ","",.),"ü","U",.)
replace venue_upper = subinstr(subinstr(venue_upper,"ğ","G",.),"İ","I",.)
*careful here, I think it's only for Fenerbahce
replace venue_upper = subinstr(subinstr(venue_upper,"ö","O",.),"ç","C",.)
* The first of the next three lines strips every apostrophe; the apostrophe rule
* repeated on the two lines after it therefore finds nothing left to remove.
replace venue_upper = subinstr(subinstr(venue_upper,"³","3",.),"'","",.)
replace venue_upper = subinstr(subinstr(venue_upper,"è","E",.),"'","",.)
replace venue_upper = subinstr(subinstr(venue_upper,"Â´","",.),"'","",.)
replace venue_upper = subinstr(subinstr(venue_upper,"Ü","U",.),"Ō","O",.)

* Venue-name harmonization, part 1: assorted fixes, Brazil, Germany, Denmark, Netherlands.
* Each line rewrites one spelling of a stadium name in venue_upper, presumably to
* the spelling used in the stadia file that is merged on venue_upper later on.
* The rules have no if-condition, so they apply to every row regardless of
* country, and they match SUBSTRINGS: a rule whose replacement contains its own
* search text also hits names that are already in the target form.

* NOTE(review): apostrophes are stripped from venue_upper earlier in this do-file,
* so this search string (which contains one) probably never matches -- confirm.
replace venue_upper = subinstr(venue_upper, "ST MARY'S STADIUM" , "SAINT MARYS",.)


replace venue_upper = subinstr(venue_upper, "SEACON STADION - DE KOEL -" , "SEACON STADION DE KOEL",.)

*ERASE some extra words in brazilian stadia:
* (no if-condition: the word is removed from every venue that contains it)
replace venue_upper = subinstr(venue_upper, "ESTADIO " , "",.)

*and change more stuff:
replace venue_upper = subinstr(venue_upper, "STADE PIERRE-MAUROY" , "STADE PIERRE MAUROY",.)
replace venue_upper = subinstr(venue_upper, "EDION STADIUM" , "HIROSHIMA BIG ARCH",.)
replace venue_upper = subinstr(venue_upper, "CORNELLA -EL PRAT" , "CORNELLA - EL PRAT",.)
replace venue_upper = subinstr(venue_upper, "ANZ STADIUM" , "STADIUM AUSTRALIA",.)
replace venue_upper = subinstr(venue_upper, "JOAQUIM AMERICO GUIMARAES" , "ARENA DA BAIXADA",.)
replace venue_upper = subinstr(venue_upper, "JOSE DO REGO MACIEL" , "DO ARRUDA",.)
replace venue_upper = subinstr(venue_upper, "JOSE PINHEIRO BORBA" , "BEIRA-RIO",.)
replace venue_upper = subinstr(venue_upper, "NACIONAL DE BRASILIA" , "NACIONAL MANE GARRINCHA",.)
replace venue_upper = subinstr(venue_upper, "RAIMUNDO SAMPAIO" , "INDEPENDENCIA",.)
replace venue_upper = subinstr(venue_upper, "URBANO CALDEIRA" , "VILA BELMIRO",.)

* germany:
replace venue_upper = subinstr(venue_upper, "COMMERZBANK-ARENA" , "COMMERZBANK ARENA",.)
replace venue_upper = subinstr(venue_upper, "JONATHAN-HEIMES-STADION AM BOLLENFALLTOR" , "MERCK-STADION AM BOLLENFALLTOR",.)
replace venue_upper = subinstr(venue_upper, "PREZERO ARENA" , "WIRSOL RHEIN-NECKAR ARENA",.)

* #denmark
replace venue_upper = subinstr(venue_upper,"ø","O",.)

* netherlands:
replace venue_upper = subinstr(venue_upper, "AMSTERDAM ARENA" , "JOHAN CRUYFF ARENA",.)
* this also prefixes names that already start with STADION; the doubled word is
* removed by a "STADION STADION GALGENWAARD" fix further down in this do-file
replace venue_upper = subinstr(venue_upper, "GALGENWAARD" , "STADION GALGENWAARD",.)
replace venue_upper = subinstr(venue_upper, "HITACHI CAPITAL MOBILITY STADION" , "EUROBORG",.)
replace venue_upper = subinstr(venue_upper, "IJSSELDELTA STADION" , "MAC3PARK STADION",.)
replace venue_upper = subinstr(venue_upper, "KYOCERA STADIUM" , "CARS JEANS STADION",.)
replace venue_upper = subinstr(venue_upper, "NOORDLEASE STADION" , "EUROBORG",.)
replace venue_upper = subinstr(venue_upper, "PARKSTAD LIMBURG STADION" , "PARKSTAD LIMBURG",.)
replace venue_upper = subinstr(venue_upper, "SEACON STADION DE KOEL" , "DE KOEL",.)
replace venue_upper = subinstr(venue_upper, "SPARTA-STADION HET KASTEEL" , "SPARTA STADION",.)
replace venue_upper = subinstr(venue_upper, "VAN DONGE & DE ROO STADION" , "STADION WOUDESTEIN",.)

* Venue-name harmonization, part 2: France, Greece, Japan, South Korea.
* Each line rewrites one spelling of a stadium name in venue_upper, presumably to
* the spelling used in the stadia file that is merged on venue_upper later on.
* The rules run in order on every row (no if-condition) and match SUBSTRINGS.
* NOTE(review): several rules lengthen a name whose short form is a prefix of the
* target (for example "STADE BOLLAERT", "STADE CHABAN", "STADE GEOFFROY",
* "STADE JEAN"); a venue already stored in the long form would get the suffix
* appended twice. Only some of these cases have a clean-up line further down in
* this do-file -- confirm with tab venue_upper.

* france:
replace venue_upper = subinstr(venue_upper, "ALTRAD STADIUM" , "STADE YVES-DU-MANOIR",.)
replace venue_upper = subinstr(venue_upper, "STADE ARMAND CESARI" , "STADE ARMAND-CESARI",.)
replace venue_upper = subinstr(venue_upper, "STADE AUGUSTE DELAUNE" , "STADE AUGUSTE-DELAUNE",.) 
replace venue_upper = subinstr(venue_upper, "STADE AUGUSTE-DELAUNE II" , "STADE AUGUSTE-DELAUNE",.)
replace venue_upper = subinstr(venue_upper, "STADE BOLLAERT" , "STADE BOLLAERT-DELEIS",.)
replace venue_upper = subinstr(venue_upper, "STADE BONAL" , "STADE AUGUSTE BONAL",.)
replace venue_upper = subinstr(venue_upper, "STADE CHABAN" , "STADE CHABAN-DELMAS",.)
replace venue_upper = subinstr(venue_upper, "STADE DE LA BEAUJOIRE - LOUIS FONTENEAU" , "STADE DE LA BEAUJOIRE",.)
replace venue_upper = subinstr(venue_upper, "STADE DE LA LICORNE" , "STADE CREDIT AGRICOLE LA LICORNE",.)
replace venue_upper = subinstr(venue_upper, "STADE DE LA MOSSON ET DU MONDIAL 98" , "STADE DE LA MOSSON",.)
replace venue_upper = subinstr(venue_upper, "STADE DE NICE" , "ALLIANZ RIVIERA",.)
replace venue_upper = subinstr(venue_upper, "STADE DU ROUDOUROU" , "STADE DE ROUDOUROU",.)
* "FRANZOIS" is the literal target spelling used here; do not correct it without
* checking the spelling in the stadia file
replace venue_upper = subinstr(venue_upper, "STADE FRANCOIS COTY" , "STADE FRANZOIS-COTY",.)
replace venue_upper = subinstr(venue_upper, "STADE GASTON GERARD" , "STADE GASTON-GERARD",.)
replace venue_upper = subinstr(venue_upper, "STADE GEOFFROY" , "STADE GEOFFROY-GUICHARD",.)
replace venue_upper = subinstr(venue_upper, "STADE JACQUES CHABAN-DELMAS" , "STADE CHABAN-DELMAS",.)
replace venue_upper = subinstr(venue_upper, "STADE JEAN" , "STADE JEAN BOUIN",.)
replace venue_upper = subinstr(venue_upper, "STADE MUNICIPAL DU RAY" , "STADE DU RAY",.)
replace venue_upper = subinstr(venue_upper, "STADE MUNICIPAL DU ROUDOUROU" , "STADE DE ROUDOUROU",.)
replace venue_upper = subinstr(venue_upper, "STADE SAINT-SYMPHORIEN" , "STADE SAINT SYMPHORIEN",.)
replace venue_upper = subinstr(venue_upper, "STADE VELODROME" , "ORANGE VELODROME",.)
* the first of these two rules already rewrites the prefix of the longer name, so the
* second one finds nothing; the leftover " - LE MOUSTOIR" is removed further down
replace venue_upper = subinstr(venue_upper, "STADE YVES ALLAINMAT" , "STADE DU MOUSTOIR",.)
replace venue_upper = subinstr(venue_upper, "STADE YVES ALLAINMAT - LE MOUSTOIR" , "STADE DU MOUSTOIR",.)
replace venue_upper = subinstr(venue_upper, "STADIUM LILLE METROPOLE" , "STADIUM LILLE-METROPOLE",.)

*greece:
replace venue_upper = subinstr(venue_upper, "KAFTANZOGLIO STADIO" , "STADIO KAFTANZOGLIO",.)
replace venue_upper = subinstr(venue_upper, "PANKRITIO STADIO" , "STADIO PANKRITIO",.)
replace venue_upper = subinstr(venue_upper, "STADIO GEORGIOS KARAISKAKI" , "KARAISKAKI",.)
replace venue_upper = subinstr(venue_upper, "STADIO HARILAOU KLEANTHIS VIKELIDIS" , "STADIO KLEANTHIS VIKELIDIS",.)
replace venue_upper = subinstr(venue_upper, "STADIO NEAS SMIRNIS" , "STADIO NEA SMYRNI",.)
replace venue_upper = subinstr(venue_upper, "STADIO THEODOROS KOLOKOTRONIS" , "GIPEDO ASTERA TRIPOLIS",.)
replace venue_upper = subinstr(venue_upper, "STADIO THODOROS VARDINOYANNIS" , "GIPEDO THEODOROS VARDINOGIANNIS",.)
replace venue_upper = subinstr(venue_upper, "STADIO TOUMBAS" , "STADIO TOUMBA",.)

*japan:
replace venue_upper = subinstr(venue_upper, "IAI STADIUM NIHONDAIRA" , "OUTSOURCING STADIUM NIHONDAIRA",.)
replace venue_upper = subinstr(venue_upper, "KAWASAKI TODOROKI STADIUM" , "TODOROKI ATHLETIC STADIUM",.)
replace venue_upper = subinstr(venue_upper, "KUMAGAYA ATHLETIC STADIUM" , "KUMAGAYA SPORTS PARK STADIUM",.)
replace venue_upper = subinstr(venue_upper, "NACK5 STADIUM OMIYA" , "OMIYA PARK STADIUM",.)
replace venue_upper = subinstr(venue_upper, "NAGANOKEN MATSUMOTODAIRA WIDE AREA PARK" , "MATSUMOTO STADIUM",.)
replace venue_upper = subinstr(venue_upper, "ND SOFT STADIUM YAMAGATA" , "ND SOFT STADIUM",.)
replace venue_upper = subinstr(venue_upper, "NHK SPRING MITSUZAWA FOOTBALL STADIUM" , "NHK SPRING MITSUZAWA STADIUM",.)
replace venue_upper = subinstr(venue_upper, "PANASONIC STADIUM SUITA" , "SUITA CITY STADIUM",.)
replace venue_upper = subinstr(venue_upper, "SANKYO FRONTIER KASHIWA STADIUM" , "KASHIWANOHA STADIUM",.)
replace venue_upper = subinstr(venue_upper, "SAPPORO ATSUBETSU PARK STADIUM" , "SAPPORO DOME",.)
replace venue_upper = subinstr(venue_upper, "TRANSCOSMOS STADIUM NAGASAKI" , "NAGASAKI ATHLETIC STADIUM",.)
replace venue_upper = subinstr(venue_upper, "YURTEC STADIUM SENDAI" , "STADIUM SENDAI",.)


*south korea:
* (the ALPENSIA rule appears twice; the second copy finds nothing left to replace)
replace venue_upper = subinstr(venue_upper, "ALPENSIA SKI JUMPING STADIUM" , "ALPENSIA STADIUM",.)
replace venue_upper = subinstr(venue_upper, "CHANGWON FOOTBALL CENTER" , "CHANGWON FOOTBALL STADIUM",.)
replace venue_upper = subinstr(venue_upper, "ALPENSIA SKI JUMPING STADIUM" , "ALPENSIA STADIUM",.)
replace venue_upper = subinstr(venue_upper, "SEONGNAM TANCHEON SPORTS COMPLEX" , "SEONGNAM SPORTS COMPLEX STADIUM",.)
replace venue_upper = subinstr(venue_upper, "STEELYARD STADIUM" , "STEELYARD",.)

* Venue-name harmonization, part 3: Spain, Portugal, England.
* Each line rewrites one spelling of a stadium name in venue_upper, presumably to
* the spelling used in the stadia file that is merged on venue_upper later on.
* The rules run in order on every row (no if-condition) and match SUBSTRINGS, so a
* rule whose replacement contains its own search text also hits names that are
* already in the target form. Some of the resulting doubled names are cleaned up
* further down in this do-file.
* The word "ESTADIO " was removed from venue_upper earlier in this do-file, which is
* why many search strings here start with DE, DEL, DO or MUNICIPAL.

*spain:
* the first rule already turns "DE BALAIDOS" into "DE ABANCA BALAIDOS", so the second
* one finds nothing; the leftover "DE " is removed further down in this do-file
replace venue_upper = subinstr(venue_upper, "BALAIDOS" , "ABANCA BALAIDOS",.)
replace venue_upper = subinstr(venue_upper, "DE BALAIDOS" , "ABANCA BALAIDOS",.)
replace venue_upper = subinstr(venue_upper, "CAMPOS DE SPORT DE EL SARDINERO" , "EL SARDINERO",.)
replace venue_upper = subinstr(venue_upper, "CIUTAT DE VALENCIA" , "ESTADI CIUTAT DE VALENCIA",.)
replace venue_upper = subinstr(venue_upper, "CIUDAD DE VALENCIA" , "ESTADI CIUTAT DE VALENCIA",.)
replace venue_upper = subinstr(venue_upper, "CORNELLA - EL PRAT" , "CORNELLA-EL PRAT",.)
replace venue_upper = subinstr(venue_upper, "CORNELLA  -EL PRAT" , "CORNELLA-EL PRAT",.)
replace venue_upper = subinstr(venue_upper, "DE GRAN CANARIA" , "GRAN CANARIA",.)
replace venue_upper = subinstr(venue_upper, "DE LOS JUEGOS MEDITERRANEOS" , "MEDITERRANEO",.)
replace venue_upper = subinstr(venue_upper, "JUEGOS DEL MEDITERRANEO" , "MEDITERRANEO",.)
replace venue_upper = subinstr(venue_upper, "DE MESTALLA" , "MESTALLA",.)
replace venue_upper = subinstr(venue_upper, "DEL RAYO VALLECANO" , "DE VALLECAS TERESA RIVERO",.)
replace venue_upper = subinstr(venue_upper, "EL MADRIGAL" , "DE LA CERAMICA",.)
replace venue_upper = subinstr(venue_upper, "HELIODORO RODRIGUEZ" , "DE TENERIFE",.)
replace venue_upper = subinstr(venue_upper, "LA ROMAREDA" , "DE LA ROMAREDA",.)
replace venue_upper = subinstr(venue_upper, "MARTINEZ VALERO" , "MANUEL MARTINEZ VALERO",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE ANOETA" , "ANOETA",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE BUTARQUE" , "DE BUTARQUE",.)
* "IPURAA" is the spelling produced by the earlier character replacements
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE IPURAA" , "IPURUA",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE RIAZOR" , "RIAZOR",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL EL MOLINON" , "EL MOLINON",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL JOSE ZORRILLA" , "JOSE ZORRILLA",.)
replace venue_upper = subinstr(venue_upper, "NOU CAMP" , "CAMP NOU",.)
replace venue_upper = subinstr(venue_upper, "NUEVO RIAZOR" , "RIAZOR",.)
replace venue_upper = subinstr(venue_upper, "NUEVO ZORRILLA" , "JOSE ZORRILLA",.)
replace venue_upper = subinstr(venue_upper, "ONO ESTADI" , "IBEROSTAR ESTADI",.)
replace venue_upper = subinstr(venue_upper, "REYNO DE NAVARRA" , "EL SADAR",.)
replace venue_upper = subinstr(venue_upper, "SON MOIX" , "IBEROSTAR ESTADI",.)

*portugal:
replace venue_upper = subinstr(venue_upper, "ANTONIO COIMBRA DA MOTA" , "ANTONIO COIMBRA",.)
replace venue_upper = subinstr(venue_upper, "CIDADE DE BARCELOS" , "COMPLEXO DESPORTIVO DE BARCELOS",.)
replace venue_upper = subinstr(venue_upper, "DA CAPITAL DO MOVEL" , "DA MATA REAL",.)
replace venue_upper = subinstr(venue_upper, "DO ALGARVE" , "ALGARVE",.)
replace venue_upper = subinstr(venue_upper, "DO BESSA SECULO XXI" , "DO BESSA XXI",.)
replace venue_upper = subinstr(venue_upper, "DO PORTIMONENSE SC" , "MUNICIPAL DE PORTIMAO",.)
* NOTE(review): "DO RESTELO" gets the same target as the Portimonense line just above,
* which looks like a copy-paste slip -- confirm the intended stadium name
replace venue_upper = subinstr(venue_upper, "DO RESTELO" , "MUNICIPAL DE PORTIMAO",.)
replace venue_upper = subinstr(venue_upper, "DO RIO AVE FUTEBOL CLUBE" , "DOS ARCOS",.)
replace venue_upper = subinstr(venue_upper, "DO SPORT LISBOA E BENFICA" , "DA LUZ",.)
replace venue_upper = subinstr(venue_upper, "DOS BARREIROS" , "DO MARITIMO",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE BRAGANCA" , "MUNICIPAL DE BRAGA",.)
replace venue_upper = subinstr(venue_upper, "EFAPEL" , "CIDADE DE COIMBRA",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL DE MACHICO" , "DE MACHICO",.)
replace venue_upper = subinstr(venue_upper, "MUNICIPAL JOSE BENTO PESSOA" , "BENTO PESSOA",.)
* both sides are cut short; presumably they mirror truncated names in the data -- confirm
replace venue_upper = subinstr(venue_upper, "PARQUE DESPORTIVO COMENDADOR JOAQUIM DE" , "P D COMENDADOR JOAQUIM DE ALMEIDA FREIT",.)

*england:
replace venue_upper = subinstr(venue_upper, "ANFIELD ROAD" , "ANFIELD",.)
replace venue_upper = subinstr(venue_upper, "BRITANNIA STADIUM" , "BET365 STADIUM",.)
replace venue_upper = subinstr(venue_upper, "CITY OF MANCHESTER" , "ETIHAD STADIUM",.)
* the truncated spelling is replaced first, so the full spelling on the next line is
* no longer present; the leftover "KCOM STADIUMONS STADIUM" is fixed further down
replace venue_upper = subinstr(venue_upper, "KINGSTON COMMUNICATI" , "KCOM STADIUM",.)
replace venue_upper = subinstr(venue_upper, "KINGSTON COMMUNICATIONS STADIUM" , "KCOM STADIUM",.)
replace venue_upper = subinstr(venue_upper, "LOFTUS ROAD STADIUM" , "LOFTUS ROAD",.)
replace venue_upper = subinstr(venue_upper, "SAINT MARYS" , "ST MARYS STADIUM",.)
replace venue_upper = subinstr(venue_upper, "THE AMERICAN EXPRESS COMMUNITY STADIUM" , "AMEX STADIUM",.)
replace venue_upper = subinstr(venue_upper, "TOTTENHAM HOTSPUR STADIUM" , "WHITE HART LANE",.)
replace venue_upper = subinstr(venue_upper, "UPTON PARK" , "BOLEYN GROUND",.)
replace venue_upper = subinstr(venue_upper, "VICARAGE ROAD STADIUM" , "VICARAGE ROAD",.)

* Venue-name harmonization, part 4: Russia, Scotland, Italy, Switzerland, Turkey,
* followed by residual clean-ups and the Toulouse special case.
* Each line rewrites one spelling of a stadium name in venue_upper, presumably to
* the spelling used in the stadia file that is merged on venue_upper later on.
* The rules run in order on every row (no if-condition) and match SUBSTRINGS.

*russia:
replace venue_upper = subinstr(venue_upper, "ANZHI-ARENA" , "ANJI ARENA",.)
replace venue_upper = subinstr(venue_upper, "COSMOS ARENA" , "SAMARA ARENA",.)
replace venue_upper = subinstr(venue_upper, "GAZPROM ARENA" , "KRESTOVSKY STADIUM",.)
replace venue_upper = subinstr(venue_upper, "KRASNODAR STADIUM" , "STADION FK KRASNODAR",.)
replace venue_upper = subinstr(venue_upper, "RZD ARENA" , "LOKOMOTIV STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION AKADEMII FK KRASNODAR" , "STADION FK KRASNODAR",.)
replace venue_upper = subinstr(venue_upper, "STADION DINAMO" , "DINAMO STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION IM AKHMAT-KHAJI KADYROVA" , "ACHMAT ARENA",.)
replace venue_upper = subinstr(venue_upper, "STADION KUBAN" , "KUBAN STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION METALLURG" , "METALLURG STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION OLIMP 2" , "OLIMP-2 STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION PETROVSKIJ" , "PETROVSKY STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION TORPEDO IM EDUARDA STRELTSOVA" , "TORPEDO STADION",.)
* the shorter ZVEZDA pattern is replaced first, so the longer one on the next line is
* no longer present; the leftover " ZAPASNOE POLE" is removed in the residuals below
replace venue_upper = subinstr(venue_upper, "STADION ZVEZDA" , "ZVEZDA STADION",.)
replace venue_upper = subinstr(venue_upper, "STADION ZVEZDA ZAPASNOE POLE" , "ZVEZDA STADION",.)

*scotland:
replace venue_upper = subinstr(venue_upper, "KILMAC STADIUM AT DENS PARK" , "KILMAC STADIUM",.)
replace venue_upper = subinstr(venue_upper, "THE HOPE CBD STADIUM" , "NEW DOUGLAS PARK",.)
replace venue_upper = subinstr(venue_upper, "THE SIMPLE DIGITAL ARENA" , "PAISLEY 2021 STADIUM",.)
replace venue_upper = subinstr(venue_upper, "TULLOCH CALEDONIAN STADIUM" , "CALEDONIAN STADIUM",.)
replace venue_upper = subinstr(venue_upper, "TYNECASTLE STADIUM" , "TYNECASTLE PARK",.)

*italy:
* NOTE(review): repeated regular spaces were collapsed when venue_upper was built, so
* the two-character gap in the first search string is presumably not two plain
* spaces (one may be a non-breaking space) -- do not retype this line by hand
replace venue_upper = subinstr(venue_upper, "STADIO CITTA  DEL TRICOLORE" , "MAPEI STADIUM",.)
replace venue_upper = subinstr(venue_upper, "STADIO COMUNALE LUIGI FERRARIS" , "STADIO LUIGI FERRARIS",.)
replace venue_upper = subinstr(venue_upper, "STADIO MARCANTONIO BENTEGODI" , "STADIO MARC ANTONIO BENTEGODI",.)

*switzerland:
replace venue_upper = subinstr(venue_upper, "STADE DE TOURBILLON" , "TOURBILLON",.)
replace venue_upper = subinstr(venue_upper, "STADION LETZIGRUND" , "LETZIGRUND",.)

*turkey:
* the first rule also prefixes names that already start with ANKARA; the doubled
* word is removed in the residuals below
replace venue_upper = subinstr(venue_upper, "19 MAYIS STADYUMU" , "ANKARA 19 MAYIS STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "4 EYLUL STADI" , "SIVAS ARENA",.)
replace venue_upper = subinstr(venue_upper, "5 OCAK STADYUMU" , "ADANA 5 OCAK STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "ANTALYA 100 YIL STADYUMU" , "ANTALYA ARENA",.)
replace venue_upper = subinstr(venue_upper, "ATATURK OLIMPIYAT STADI" , "ATATURK OLYMPIC STADIUM",.)
replace venue_upper = subinstr(venue_upper, "BASAKSEHIR FATIH TERIM STADIUM" , "FATIH TERIM STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "BORNOVA DOGANLAR STADI" , "BORNOVA STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "ESKISEHIR ATATURK STADYUMU" , "YENI ESKISEHIR STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "KADIR HAS SEHIR STADI" , "KADIR HAS STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "KALYON ARENA" , "GAZIANTEP STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "SENOL GUNES STADYUMU" , "MEDICALPARK STADYUMU",.)
replace venue_upper = subinstr(venue_upper, "TURK TELEKOM ARENA" , "TURK TELEKOM STADIUM",.)
replace venue_upper = subinstr(venue_upper, "VODAFONE STADIUM" , "VODAFONE ARENA",.)

*and some residuals I skipped in the past: (AND/OR BAD CODING:)
* Most of these undo doubled or half-replaced names created by earlier
* substring rules in this do-file.
replace venue_upper = subinstr(venue_upper, "STADION FEIJENOORD" , "DE KUIP",.)
replace venue_upper = subinstr(venue_upper, "STADION STADION GALGENWAARD" , "STADION GALGENWAARD",.)
replace venue_upper = subinstr(venue_upper, "MATMUT STADIUM DE GERLAND" , "STADE MUNICIPAL DE GERLAND",.)
replace venue_upper = subinstr(venue_upper, "STADE DU MOUSTOIR - LE MOUSTOIR" , "STADE DU MOUSTOIR",.)
replace venue_upper = subinstr(venue_upper, "STADE GEOFFROY-GUICHARD-GUICHAR" , "STADE GEOFFROY GUICHARD",.)
replace venue_upper = subinstr(venue_upper, "STADE GEOFFROY-GUICHARD" , "STADE GEOFFROY GUICHARD",.)
replace venue_upper = subinstr(venue_upper, "STADE GEOFFROY GUICHARDD" , "STADE GEOFFROY GUICHARD",.)

replace venue_upper = subinstr(venue_upper, "BEST AMENITY STADIUM" , "EKIMAE REAL ESTATE STADIUM",.)
replace venue_upper = subinstr(venue_upper, "DE ABANCA BALAIDOS" , "ABANCA BALAIDOS",.)
replace venue_upper = subinstr(venue_upper, "MANUEL MANUEL MARTINEZ VALERO" , "MANUEL MARTINEZ VALERO",.)
replace venue_upper = subinstr(venue_upper, "KCOM STADIUMONS STADIUM" , "KCOM STADIUM",.)
replace venue_upper = subinstr(venue_upper, "ANKARA ANKARA 19 MAYIS STADYUMU" , "ANKARA 19 MAYIS STADYUMU",.)
* final spelling for this venue is "CORNELLA EL PRAT", without any hyphen
replace venue_upper = subinstr(venue_upper, "CORNELLA - EL PRAT" , "CORNELLA EL PRAT",.)

replace venue_upper = subinstr(venue_upper, "CORNELLA-EL PRAT" , "CORNELLA EL PRAT",.)

replace venue_upper = subinstr(venue_upper, "ZVEZDA STADION ZAPASNOE POLE" , "ZVEZDA STADION",.)


* Toulouse: the generic names "STADIUM" and "STADIUM MUNICIPAL" are only rewritten
* when they match the WHOLE venue name and the home team is Toulouse.
replace venue_upper = "STADIUM MUNICIPAL TOULOUSE" if venue_upper == "STADIUM" & home_team == "Toulouse"

replace venue_upper = "STADIUM MUNICIPAL TOULOUSE" if venue_upper == "STADIUM MUNICIPAL" & home_team == "Toulouse"

*replace venue_upper = "PARC DES SPORTS DANNECY" if strpos(venue_upper,"PARC DES SPORTS")


*NOW COMES THE MERGE:
* Attach the stadium data to every row by the cleaned venue name. stadia_merge
* records the result: 1 = venue found only in the data in memory, 2 = venue found
* only in the stadia file, 3 = venue found in both.
*
* The stadia file is looked up through the cleandata global, relative to the
* project root, so the merge does not depend on one collaborator's folder layout.
* If the file cannot be found that way (the working directory is not the project
* root at this point), the original absolute path is used instead.
capture confirm file "${cleandata}/stadia.dta"
if _rc == 0 {
	merge m:1 venue_upper using "${cleandata}/stadia", force generate(stadia_merge)
}
else {
	merge m:1 venue_upper using "C:\Users\camel\Dropbox\HFA and VAR\1_data\1_clean\stadia", force generate(stadia_merge)
}


*br venue_upper if country_stadium=="australia" & stadia_merge==2

* The venues listed by the commands below are the ones still without a match in the
* stadia file; they are the reason for going back up to add more name fixes.
* br venue_upper attendance if stadia_merge==1 & attendance==. 

tab venue_upper if stadia_merge==1
* br venue_upper if stadia_merge==1

* Quality check on the merged stadium data: crowd_pressure is attendance divided by
* capacity. A ratio above 1 (1.25 is used as the cut-off) means something is off
* with the venue match or the capacity figure, and the venue should be revisited.
gen crowd_pressure = attendance/capacity

summarize crowd_pressure

tabulate venue_upper if crowd_pressure > 1.25 & crowd_pressure < .

* Author's note from the last check: only 205 observations were above the cut-off.
gsort -crowd_pressure
browse venue venue_upper attendance capacity crowd_pressure season date home_team away_team if crowd_pressure > 1.25 & crowd_pressure < .

* To be dealt with later; only those 205 observations are compromised.





* Ok, data ready to be saved and used in regressions.
* The replace option lets the do-file be run again without stopping on the
* already-existing output file at the very last step.
* The file is written through the finaldata global, relative to the project root.
* If that fails (for example because the working directory is not the project
* root at this point), the original absolute path is used instead.
capture save "${finaldata}/final_treatment_wrefs_experience_n_stadia.dta", replace
if _rc {
	save "C:\Users\camel\Dropbox\HFA and VAR\1_data\2_final\final_treatment_wrefs_experience_n_stadia.dta", replace
}


