#----------------------------------------------------------------------------------------------------#
# DATA GENERATION EXERCISE 3 Answer Key
#----------------------------------------------------------------------------------------------------#

Your job is to load the Canadian Occupational Prestige data found on this website:

http://socserv.mcmaster.ca/jfox/Books/Companion/data.html

# Read the Prestige table straight from the book's website. It is stored as
# prestige2 because we already have a dataset called prestige in our workspace.
prestige2 <- read.table(
  file = "http://socserv.mcmaster.ca/jfox/Books/Companion/data/Prestige.txt"
)

Part 1: Here are a few questions you should be able to answer. In addition to providing the answer, be able to show the code you used to arrive at the answer. 

1.	How many occupations are listed in the dataset?

# Each occupation is one row, so the row count answers the question
nrow(prestige2)
102

2.	Are there any duplicate census codes?

# Count the distinct census codes. The answer is yes, there are duplicates:
# there are only 101 unique codes but 102 rows.
length(unique(prestige2[["census"]]))

# To find the repeated code you can scan the frequency table by eye
table(prestige2[["census"]])
# Yes, 8215

# Or store the frequency table and ask which codes occur more than once
censustable <- table(prestige2[["census"]])
which(censustable > 1)

3.	Which occupation has the highest percentage of women?

# Visual approach: sort by the women column (ascending) and read the last row
prestige2[order(prestige2[["women"]]), ]
# Secretaries

# Direct approach: keep only the row(s) where women equals its maximum
with(prestige2, prestige2[women == max(women), ])

4.	Which 10 occupations have the highest average level of education?

# Whole dataset, sorted by education in ascending order
prestige2[order(prestige2[["education"]]), ]

# Only the ten rows with the highest education
prestige2[order(prestige2[["education"]], decreasing = TRUE)[seq_len(10)], ]

#You can either write them out by hand
University Teachers, Physicians, Veterinarians, Lawyers, Physicists, Architects, Vocational Counsellors, Pharmacists, Biologists, Secondary School Teachers

# Or have R print out the vector of row names for the ten rows with the
# highest education. An assignment on its own prints nothing, so the stored
# vector is evaluated on the next line to actually display the names.
higheducnames <- row.names(
  prestige2[order(prestige2$education, decreasing = TRUE)[1:10], ]
)
higheducnames

5.	How many "types" of occupations are there?

# Tabulate the type column; each distinct value in the table is one type
table(prestige2[["type"]])
3 types: bc, prof, & wc

6.	Which occupations, on average, seem to require at least a high school degree (13 years of school)?

# Show the rows of prestige2 whose education is 13 years or more. The row
# filter is computed from prestige2, so it must be applied to prestige2 as
# well, not to the separate prestige object in the workspace.
prestige2[which(prestige2$education >= 13), ]


7.	Which blue collar jobs pay over $8,000?

# Either store the matching rows as their own data frame
prestige2.bc8000 <- subset(
  prestige2,
  subset = type == "bc" & income > 8000,
  select = c(education, income, women, prestige, census, type)
)

# Or display the rows that meet both criteria directly
with(prestige2, prestige2[which(income > 8000 & type == "bc"), ])

Firefighters, policemen, tool die makers, electrical linemen, construction foremen, train engineers

8.	What is the mean prestige level?

# Average of the prestige column
with(prestige2, mean(prestige))
46.83333

Part 2: Here are some additional things you should be able to do.

1.	Recode a variable in a meaningful way

# Recode education so that it is rounded to full years
prestige2$education <- round(prestige2$education)

2.	Rename one of the variables. 

# Rename the first column (education) to educ so it is less to type later
names(prestige2)[1L] <- "educ"

3.	Make a table showing key statistics for the variables. 

# Build a table of key statistics (mean, SD, min, max and number of
# non-missing values) for the columns of prestige2 selected by position in vec.
vec <- c(1, 2, 3, 4, 5)

# Pre-allocate one row per selected column, labelled with that column's name
summarystat <- as.data.frame(matrix(NA, nrow = length(vec), ncol = 5))
names(summarystat) <- c("Mean", "SD", "Min", "Max", "N")
row.names(summarystat) <- names(prestige2)[vec]
summarystat

# seq_along() stays correct if vec is ever empty, where 1:length(vec) would
# iterate over c(1, 0). TRUE is spelled out because T can be reassigned.
for (i in seq_along(vec)) {
  # Extract the column once instead of re-indexing it for every statistic
  column_values <- prestige2[[vec[i]]]

  summarystat[i, "Mean"] <- mean(column_values, na.rm = TRUE)
  summarystat[i, "SD"] <- sd(column_values, na.rm = TRUE)
  summarystat[i, "Min"] <- min(column_values, na.rm = TRUE)
  summarystat[i, "Max"] <- max(column_values, na.rm = TRUE)
  # N counts the non-missing observations only
  summarystat[i, "N"] <- sum(!is.na(column_values))
}

summarystat

               Mean          SD    Min      Max   N
educ       10.72549    2.718317    6.0    16.00 102
income   6797.90196 4245.922227  611.0 25879.00 102
women      28.97902   31.724931    0.0    97.51 102
prestige   46.83333   17.204486   14.8    87.20 102
census   5401.77451 2644.993215 1113.0  9517.00 102

4.	Formulate a hypothesis and run a t.test to assess it

Null Hypothesis: There is no relationship between the gender composition of an occupation and average income.

Alternate Hypothesis: There is a relationship between the gender composition of an occupation and average income. Occupations with 50% or more women have lower incomes than those with fewer than 50% women.

# Dummy variable: 1 when women is 50 or more, 0 when it is below 50, and NA
# when women is missing
prestige2$womendum <- ifelse(prestige2$women >= 50, 1, 0)

# Compare mean income between the two womendum groups
t.test(prestige2$income ~ prestige2$womendum)

data:  prestige2$income by prestige2$womendum
t = 6.5725, df = 99.919, p-value = 2.27e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2713.235 5059.553
sample estimates:
mean in group 0 mean in group 1 
       7826.653        3940.259

Because the p-value is less than .05, we can reject the null hypothesis and conclude that there is a statistically significant difference in income between occupations with 50% or more women and occupations with fewer than 50% women, such that occupations with 50% or more women (mean income=3940) have lower incomes than those with fewer than 50% women (mean income=7827).