Plotting and linear models

To run your code, click Run. The exercise checker will tell you whether your code is correct and, if it is not, will offer hints on what to fix. If you are stuck, you can see the solution by clicking Solution.

Exercise 1: Calculate a Linear model

{"language":"r","pre_exercise_code":"library(dplyr) # loads dplyr for data manipulation\ndf <- data.frame(tic=rep(\"WMT\",194),\n                 fyearq=c(1970,1970,1970,1970,1971,1971,1971,1971,1972,1972,1972,1972,1973,1973,1973,1973,1974,1974,1974,1974,1975,1975,1975,1975,1976,1976,1976,1976,1977,1977,1977,1977,1978,1978,1978,1978,1979,1979,1979,1979,1980,1980,1980,1980,1981,1981,1981,1981,1982,1982,1982,1982,1983,1983,1983,1983,1984,1984,1984,1984,1985,1985,1985,1985,1986,1986,1986,1986,1987,1987,1987,1987,1988,1988,1988,1988,1989,1989,1989,1989,1990,1990,1990,1990,1991,1991,1991,1991,1992,1992,1992,1992,1993,1993,1993,1993,1994,1994,1994,1994,1995,1995,1995,1995,1996,1996,1996,1996,1997,1997,1997,1997,1998,1998,1998,1998,1999,1999,1999,1999,2000,2000,2000,2000,2001,2001,2001,2001,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,2004,2004,2005,2005,2005,2005,2006,2006,2006,2006,2007,2007,2007,2007,2008,2008,2008,2008,2009,2009,2009,2009,2010,2010,2010,2010,2011,2011,2011,2011,2012,2012,2012,2012,2013,2013,2013,2013,2014,2014,2014,2014,2015,2015,2015,2015,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018),\n                 fqtr=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2),\n                 
revtq=c(7.267,9.939,11.47,15.61,12.758,18.207,19.205,27.844,21.011,29.918,32.683,41.277,32.737,37.655,42.098,55.071,46.355,56.177,59.462,74.215,64.65,78.852,86.108,110.721,93.971,115.186,121.875,147.775,122.655,152.381,172.637,230.783,167.274,204.085,221.982,306.957,238.569,291.685,300.747,417.175,314.575,372.598,401.842,554.184,434.251,533.653,634.476,842.617,665.229,788.705,827.767,1094.551,854.734,1098.942,1166.594,1546.639,1234.768,1508.533,1583.573,2073.983,1655.661,1934.358,2087.533,2773.936,2344.253,2770.276,2949.008,3845.557,3211.761,3732.237,4015.017,5000.238,4282.012,4845.133,4991.113,6530.734,5373.258,6046.41,6283.496,8107.484,6768.191,7543.508,7930.949,10358.898,9280.566,10339.898,10627.5,13638.797,11649.398,13028.398,13683.797,17122,13920.398,16236.5,16826.898,20360.5,17686.098,19942.297,20417.699,24447.797,20440,22723,22914,27550,22772,25587,25644,30856,25409,28386,28777,35386,29819,33521,33509,40785,35129,38913,40899,51868,43447,46588,46181,57079,48565,53187,53185,64735,52126,56781,55765,66905,57224,63231,63036,75190,65434,70516,69282,82900,70755,76697,75397,89252,79675,85430,84467,99078,86378,92827,91794,107289,94940,102342,98345,108747,94214,100876,99373,113622,99811,103726,101952,116360,104189,109366,110226,123169,113010,114065,113800,127776,114070,116830,115688,129706,114960,120125,119001,131565,114826,120229,117408,129667,115904,120319,118179,130936,117542,122968,123179,136267,122690,128028))\ndf <- df %>% mutate(revtq_lag=lag(revtq))","sample":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n\n# Split the data into training and testing data\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\n# Calculate a linear model of revenue on lag revenue, store it in mod1\n# Use the training data 
only\nmod1 <- lm( ~ , data=)\n\n# Print out a summary of the model using summary\n\n\n#END","solution":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n\n# Split the data into training and testing data\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\n# Calculate a linear model of revenue on lag revenue, store it in mod1\n# Use the training data only\nmod1 <- lm(revtq ~ revtq_lag, data=training)\n\n# Print out a summary of the model using summary\nsummary(mod1)\n\n#END","sct":"# Template based on https://www.rdocumentation.org/packages/testwhat/versions/4.1.1\n# Check if something is explicitly typed\n\ntest_student_typed('revtq ~ revtq_lag',not_typed_msg='Make sure to regress `revtq` on `revtq_lag`.')\ntest_student_typed('data=training',not_typed_msg='Make sure to use the `training` data to fit the model.')\ntest_expression_output(\"mod1\", incorrect_msg=\"Your regression model isn't quite right.\")\ntest_student_typed('summary(mod1)',not_typed_msg='Your model is correct, but did you remember to summarize it with `summary()`?')\n\n# test_student_typed('x <- 2', not_typed_msg='')\n\n# Check if function was used in input code\n# test_function('c',incorrect_msg='')  \n\n# Requires an object `x` to have the same value as the solution\n# test_object(\"x\",incorrect_msg = \"\",undefined_msg = \"\")  \n\n# Requires an onject with the same value of `x` in the solution\n# test_an_object(\"x\",undefined_msg=\"\")\n\n# Checks if output of student's code contains given evaluated expression\n# test_output_contains(\"x\",incorrect_msg = \"\")\n\n# Check if a vector of predefined objects are unchanged\n# test_predefined_objects(c('x','y'),incorrect_msg=\"Don't onverwrite the predefined variables\")\n\n# Checks for a regex 
pattern in trhe output\n# test_output_regex(pattern,fixed=F, times=1, incorrect_msg='')\n\n# Can check an arbitrary expression across both solution and student code\n#test_expression_output(\"typeof(company_name)\", incorrect_msg=\"Did you store textual data in `company_name`?\")\n\ntest_error()\nsuccess_msg(\"Awesome!\")\n\n# Other functions to note:\n#     - test_or(a,b) -- checks if either test a or test b pass\n#     - test_ggplot() -- can check if plots are correct\n#     - test_function() -- can also check included parameters\n#     - test_loop() -- checking for and while loops\n#     - test_library_function('package', not_called_msg='',incorrect_msg='')\n#     - test_if_else() -- checking if statements\n#     - test_expression_error() -- can check if functions are properly defined\n#     - test_operator('operator',), not_called_msg='',incorrect_msg='')\n#     - test_function_definition() -- rigorously check defined function\n#     - test_data_frame() -- check if dataframe [columns] are equivalent\n#     - test_function_result, test_expression_result"}

Exercise 2: Using ggplot()

Note: at the time of writing, DataCamp Light does not appear to be able to check plots for accuracy. Once you think you have the answer (or if you get stuck), please compare your plot with the solution manually.

{"language":"r","pre_exercise_code":"library(dplyr) # loads dplyr for data manipulation\ndf <- data.frame(tic=rep(\"WMT\",194),\n                 fyearq=c(1970,1970,1970,1970,1971,1971,1971,1971,1972,1972,1972,1972,1973,1973,1973,1973,1974,1974,1974,1974,1975,1975,1975,1975,1976,1976,1976,1976,1977,1977,1977,1977,1978,1978,1978,1978,1979,1979,1979,1979,1980,1980,1980,1980,1981,1981,1981,1981,1982,1982,1982,1982,1983,1983,1983,1983,1984,1984,1984,1984,1985,1985,1985,1985,1986,1986,1986,1986,1987,1987,1987,1987,1988,1988,1988,1988,1989,1989,1989,1989,1990,1990,1990,1990,1991,1991,1991,1991,1992,1992,1992,1992,1993,1993,1993,1993,1994,1994,1994,1994,1995,1995,1995,1995,1996,1996,1996,1996,1997,1997,1997,1997,1998,1998,1998,1998,1999,1999,1999,1999,2000,2000,2000,2000,2001,2001,2001,2001,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,2004,2004,2005,2005,2005,2005,2006,2006,2006,2006,2007,2007,2007,2007,2008,2008,2008,2008,2009,2009,2009,2009,2010,2010,2010,2010,2011,2011,2011,2011,2012,2012,2012,2012,2013,2013,2013,2013,2014,2014,2014,2014,2015,2015,2015,2015,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018),\n                 fqtr=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2),\n                 
revtq=c(7.267,9.939,11.47,15.61,12.758,18.207,19.205,27.844,21.011,29.918,32.683,41.277,32.737,37.655,42.098,55.071,46.355,56.177,59.462,74.215,64.65,78.852,86.108,110.721,93.971,115.186,121.875,147.775,122.655,152.381,172.637,230.783,167.274,204.085,221.982,306.957,238.569,291.685,300.747,417.175,314.575,372.598,401.842,554.184,434.251,533.653,634.476,842.617,665.229,788.705,827.767,1094.551,854.734,1098.942,1166.594,1546.639,1234.768,1508.533,1583.573,2073.983,1655.661,1934.358,2087.533,2773.936,2344.253,2770.276,2949.008,3845.557,3211.761,3732.237,4015.017,5000.238,4282.012,4845.133,4991.113,6530.734,5373.258,6046.41,6283.496,8107.484,6768.191,7543.508,7930.949,10358.898,9280.566,10339.898,10627.5,13638.797,11649.398,13028.398,13683.797,17122,13920.398,16236.5,16826.898,20360.5,17686.098,19942.297,20417.699,24447.797,20440,22723,22914,27550,22772,25587,25644,30856,25409,28386,28777,35386,29819,33521,33509,40785,35129,38913,40899,51868,43447,46588,46181,57079,48565,53187,53185,64735,52126,56781,55765,66905,57224,63231,63036,75190,65434,70516,69282,82900,70755,76697,75397,89252,79675,85430,84467,99078,86378,92827,91794,107289,94940,102342,98345,108747,94214,100876,99373,113622,99811,103726,101952,116360,104189,109366,110226,123169,113010,114065,113800,127776,114070,116830,115688,129706,114960,120125,119001,131565,114826,120229,117408,129667,115904,120319,118179,130936,117542,122968,123179,136267,122690,128028))\ndf <- df %>% mutate(revtq_lag=lag(revtq))\n\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\nmod1 <- lm(revtq ~ revtq_lag, data=training)","sample":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\n# training and mod1 from Exercise 1 are also loaded\ntraining[1:5,]\n\n# Load ggplot2 for 
plotting\nlibrary(ggplot2)\n\n# Plot out the actual and predicted values as a scatterplot using ggplot2\n# Predicted values can be calculated using `predict(mod1, training)`\n# Use the ggplot() and geom_point() functions to plot\n# Use factor to specify color by quarter (fqtr)\n# To make the graph nicer looking, you can use the xlab() and ylab() function to label the axes\n# I recommend adding `geom_abline(slope=1)` to the plot as well to see where perfect accuracy would be\nggplot(data=, aes(y=, x=, color=)) + \n  geom_point()\n\n#END","solution":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\n# training and mod1 from Exercise 1 are also loaded\ntraining[1:5,]\n\n# Load ggplot2 for plotting\nlibrary(ggplot2)\n\n# Plot out the actual and predicted values as a scatterplot using ggplot2\n# Predicted values can be calculated using `predict(mod1, training)`\n# Use the ggplot() and geom_point() functions to plot\n# Use factor to specify color by quarter (fqtr)\n# To make the graph nicer looking, you can use the xlab() and ylab() function to label the axes\n# I recommend adding `geom_abline(slope=1)` to the plot as well to see where perfect accuracy would be\nggplot(data=training, aes(y=revtq,x=predict(mod1, training), color=factor(fqtr))) + \n  geom_point() + \n  geom_abline(slope=1) + \n  xlab(\"Predicted revenue\") + \n  ylab(\"Actual revenue\")\n\n#END","sct":"# Template based on https://www.rdocumentation.org/packages/testwhat/versions/4.1.1\n# Check if something is explicitly typed\n\ntest_student_typed('y=revtq',not_typed_msg='Make sure to set `y=` to be `revtq`')\ntest_student_typed('x=predict(mod1, training)',not_typed_msg='Make sure to set `x=` to be `predict(mod1, training)`.')\ntest_student_typed('color=factor(fqtr)',not_typed_msg='You should 
put `factor()` in your expression for color, so as to get more distinct colors (as opposed to a color scale with `color=fqtr`).')\ntest_student_typed('geom_point()',not_typed_msg='Add `geom_point()` to your `ggplot()` call to make a scatter plot.')\n\n# test_student_typed('x <- 2', not_typed_msg='')\n\n# Check if function was used in input code\n# test_function('c',incorrect_msg='')  \n\n# Requires an object `x` to have the same value as the solution\n# test_object(\"x\",incorrect_msg = \"\",undefined_msg = \"\")  \n\n# Requires an onject with the same value of `x` in the solution\n# test_an_object(\"x\",undefined_msg=\"\")\n\n# Checks if output of student's code contains given evaluated expression\n# test_output_contains(\"x\",incorrect_msg = \"\")\n\n# Check if a vector of predefined objects are unchanged\n# test_predefined_objects(c('x','y'),incorrect_msg=\"Don't onverwrite the predefined variables\")\n\n# Checks for a regex pattern in trhe output\n# test_output_regex(pattern,fixed=F, times=1, incorrect_msg='')\n\n# Can check an arbitrary expression across both solution and student code\n#test_expression_output(\"typeof(company_name)\", incorrect_msg=\"Did you store textual data in `company_name`?\")\n\ntest_error()\nsuccess_msg(\"Awesome!\")\n\n# Other functions to note:\n#     - test_or(a,b) -- checks if either test a or test b pass\n#     - test_ggplot() -- can check if plots are correct\n#     - test_function() -- can also check included parameters\n#     - test_loop() -- checking for and while loops\n#     - test_library_function('package', not_called_msg='',incorrect_msg='')\n#     - test_if_else() -- checking if statements\n#     - test_expression_error() -- can check if functions are properly defined\n#     - test_operator('operator',), not_called_msg='',incorrect_msg='')\n#     - test_function_definition() -- rigorously check defined function\n#     - test_data_frame() -- check if dataframe [columns] are equivalent\n#     - test_function_result, 
test_expression_result"}

Exercise 3: A better linear model

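Reminder: in an R model formula, `a * b` specifies a full interaction (both main effects plus the interaction term, i.e. `a + b + a:b`), while `a:b` specifies a partial interaction (the interaction term only).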

{"language":"r","pre_exercise_code":"library(dplyr) # loads dplyr for data manipulation\ndf <- data.frame(tic=rep(\"WMT\",194),\n                 fyearq=c(1970,1970,1970,1970,1971,1971,1971,1971,1972,1972,1972,1972,1973,1973,1973,1973,1974,1974,1974,1974,1975,1975,1975,1975,1976,1976,1976,1976,1977,1977,1977,1977,1978,1978,1978,1978,1979,1979,1979,1979,1980,1980,1980,1980,1981,1981,1981,1981,1982,1982,1982,1982,1983,1983,1983,1983,1984,1984,1984,1984,1985,1985,1985,1985,1986,1986,1986,1986,1987,1987,1987,1987,1988,1988,1988,1988,1989,1989,1989,1989,1990,1990,1990,1990,1991,1991,1991,1991,1992,1992,1992,1992,1993,1993,1993,1993,1994,1994,1994,1994,1995,1995,1995,1995,1996,1996,1996,1996,1997,1997,1997,1997,1998,1998,1998,1998,1999,1999,1999,1999,2000,2000,2000,2000,2001,2001,2001,2001,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,2004,2004,2005,2005,2005,2005,2006,2006,2006,2006,2007,2007,2007,2007,2008,2008,2008,2008,2009,2009,2009,2009,2010,2010,2010,2010,2011,2011,2011,2011,2012,2012,2012,2012,2013,2013,2013,2013,2014,2014,2014,2014,2015,2015,2015,2015,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018),\n                 fqtr=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2),\n                 
revtq=c(7.267,9.939,11.47,15.61,12.758,18.207,19.205,27.844,21.011,29.918,32.683,41.277,32.737,37.655,42.098,55.071,46.355,56.177,59.462,74.215,64.65,78.852,86.108,110.721,93.971,115.186,121.875,147.775,122.655,152.381,172.637,230.783,167.274,204.085,221.982,306.957,238.569,291.685,300.747,417.175,314.575,372.598,401.842,554.184,434.251,533.653,634.476,842.617,665.229,788.705,827.767,1094.551,854.734,1098.942,1166.594,1546.639,1234.768,1508.533,1583.573,2073.983,1655.661,1934.358,2087.533,2773.936,2344.253,2770.276,2949.008,3845.557,3211.761,3732.237,4015.017,5000.238,4282.012,4845.133,4991.113,6530.734,5373.258,6046.41,6283.496,8107.484,6768.191,7543.508,7930.949,10358.898,9280.566,10339.898,10627.5,13638.797,11649.398,13028.398,13683.797,17122,13920.398,16236.5,16826.898,20360.5,17686.098,19942.297,20417.699,24447.797,20440,22723,22914,27550,22772,25587,25644,30856,25409,28386,28777,35386,29819,33521,33509,40785,35129,38913,40899,51868,43447,46588,46181,57079,48565,53187,53185,64735,52126,56781,55765,66905,57224,63231,63036,75190,65434,70516,69282,82900,70755,76697,75397,89252,79675,85430,84467,99078,86378,92827,91794,107289,94940,102342,98345,108747,94214,100876,99373,113622,99811,103726,101952,116360,104189,109366,110226,123169,113010,114065,113800,127776,114070,116830,115688,129706,114960,120125,119001,131565,114826,120229,117408,129667,115904,120319,118179,130936,117542,122968,123179,136267,122690,128028))\ndf <- df %>% mutate(revtq_lag=lag(revtq))","sample":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n\n# Split the data into training and testing data\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\n# Calculate a linear model of revenue on lag revenue partially interacted with quarter, store it in 
mod2\n# Using the training data only\nmod2 <- lm( ~ , data=training)\n\n# Print out a summary of the model using summary\nsummary(mod2)\n\n#END","solution":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n\n# Split the data into training and testing data\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\n# Calculate a linear model of revenue on lag revenue partially interacted with quarter, store it in mod2\n# Using the training data only\nmod2 <- lm(revtq ~ revtq_lag:factor(fqtr), data=training)\n\n# Print out a summary of the model using summary\nsummary(mod2)\n\n#END","sct":"# Template based on https://www.rdocumentation.org/packages/testwhat/versions/4.1.1\n# Check if something is explicitly typed\n\ntest_student_typed('revtq ~ ',not_typed_msg='Make sure to have `revtq` as your dependent variable.')\ntest_student_typed(':', not_typed_msg=\"Make sure to interact `revtq_lag` with `factor(fqtr)` using `:`.\")\ntest_student_typed('factor', not_typed_msg=\"Make sure to use factor to turn fqtr into a factor variable.\")\ntest_expression_output(\"unname(mod2$coefficients)\", incorrect_msg=\"Your regression model isn't quite right.\")\ntest_student_typed('summary(mod2)',not_typed_msg='Your model is correct, but did you remember to summarize it with `summary()`?')\n\n# test_student_typed('x <- 2', not_typed_msg='')\n\n# Check if function was used in input code\n# test_function('c',incorrect_msg='')  \n\n# Requires an object `x` to have the same value as the solution\n# test_object(\"x\",incorrect_msg = \"\",undefined_msg = \"\")  \n\n# Requires an onject with the same value of `x` in the solution\n# test_an_object(\"x\",undefined_msg=\"\")\n\n# Checks if output of student's code contains given evaluated expression\n# 
test_output_contains(\"x\",incorrect_msg = \"\")\n\n# Check if a vector of predefined objects are unchanged\n# test_predefined_objects(c('x','y'),incorrect_msg=\"Don't onverwrite the predefined variables\")\n\n# Checks for a regex pattern in trhe output\n# test_output_regex(pattern,fixed=F, times=1, incorrect_msg='')\n\n# Can check an arbitrary expression across both solution and student code\n#test_expression_output(\"typeof(company_name)\", incorrect_msg=\"Did you store textual data in `company_name`?\")\n\ntest_error()\nsuccess_msg(\"Awesome!\")\n\n# Other functions to note:\n#     - test_or(a,b) -- checks if either test a or test b pass\n#     - test_ggplot() -- can check if plots are correct\n#     - test_function() -- can also check included parameters\n#     - test_loop() -- checking for and while loops\n#     - test_library_function('package', not_called_msg='',incorrect_msg='')\n#     - test_if_else() -- checking if statements\n#     - test_expression_error() -- can check if functions are properly defined\n#     - test_operator('operator',), not_called_msg='',incorrect_msg='')\n#     - test_function_definition() -- rigorously check defined function\n#     - test_data_frame() -- check if dataframe [columns] are equivalent\n#     - test_function_result, test_expression_result"}

Exercise 4: Plotting the better linear model

Note: at the time of writing, DataCamp Light does not appear to be able to check plots for accuracy. Once you think you have the answer (or if you get stuck), please compare your plot with the solution manually.

Notice how this code is identical to the solution for Exercise 2, except that it uses the new model, `mod2`, in place of `mod1`.

{"language":"r","pre_exercise_code":"library(dplyr) # loads dplyr for data manipulation\ndf <- data.frame(tic=rep(\"WMT\",194),\n                 fyearq=c(1970,1970,1970,1970,1971,1971,1971,1971,1972,1972,1972,1972,1973,1973,1973,1973,1974,1974,1974,1974,1975,1975,1975,1975,1976,1976,1976,1976,1977,1977,1977,1977,1978,1978,1978,1978,1979,1979,1979,1979,1980,1980,1980,1980,1981,1981,1981,1981,1982,1982,1982,1982,1983,1983,1983,1983,1984,1984,1984,1984,1985,1985,1985,1985,1986,1986,1986,1986,1987,1987,1987,1987,1988,1988,1988,1988,1989,1989,1989,1989,1990,1990,1990,1990,1991,1991,1991,1991,1992,1992,1992,1992,1993,1993,1993,1993,1994,1994,1994,1994,1995,1995,1995,1995,1996,1996,1996,1996,1997,1997,1997,1997,1998,1998,1998,1998,1999,1999,1999,1999,2000,2000,2000,2000,2001,2001,2001,2001,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,2004,2004,2005,2005,2005,2005,2006,2006,2006,2006,2007,2007,2007,2007,2008,2008,2008,2008,2009,2009,2009,2009,2010,2010,2010,2010,2011,2011,2011,2011,2012,2012,2012,2012,2013,2013,2013,2013,2014,2014,2014,2014,2015,2015,2015,2015,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018),\n                 fqtr=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2),\n                 
revtq=c(7.267,9.939,11.47,15.61,12.758,18.207,19.205,27.844,21.011,29.918,32.683,41.277,32.737,37.655,42.098,55.071,46.355,56.177,59.462,74.215,64.65,78.852,86.108,110.721,93.971,115.186,121.875,147.775,122.655,152.381,172.637,230.783,167.274,204.085,221.982,306.957,238.569,291.685,300.747,417.175,314.575,372.598,401.842,554.184,434.251,533.653,634.476,842.617,665.229,788.705,827.767,1094.551,854.734,1098.942,1166.594,1546.639,1234.768,1508.533,1583.573,2073.983,1655.661,1934.358,2087.533,2773.936,2344.253,2770.276,2949.008,3845.557,3211.761,3732.237,4015.017,5000.238,4282.012,4845.133,4991.113,6530.734,5373.258,6046.41,6283.496,8107.484,6768.191,7543.508,7930.949,10358.898,9280.566,10339.898,10627.5,13638.797,11649.398,13028.398,13683.797,17122,13920.398,16236.5,16826.898,20360.5,17686.098,19942.297,20417.699,24447.797,20440,22723,22914,27550,22772,25587,25644,30856,25409,28386,28777,35386,29819,33521,33509,40785,35129,38913,40899,51868,43447,46588,46181,57079,48565,53187,53185,64735,52126,56781,55765,66905,57224,63231,63036,75190,65434,70516,69282,82900,70755,76697,75397,89252,79675,85430,84467,99078,86378,92827,91794,107289,94940,102342,98345,108747,94214,100876,99373,113622,99811,103726,101952,116360,104189,109366,110226,123169,113010,114065,113800,127776,114070,116830,115688,129706,114960,120125,119001,131565,114826,120229,117408,129667,115904,120319,118179,130936,117542,122968,123179,136267,122690,128028))\ndf <- df %>% mutate(revtq_lag=lag(revtq))\n\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\nmod1 <- lm(revtq ~ revtq_lag, data=training)\nmod2 <- lm(revtq ~ revtq_lag:factor(fqtr), data=training)","sample":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n# mod2 and training from Exercise 3 are also 
loaded\n\n# Load ggplot2 for plotting\nlibrary(ggplot2)\n\n# Plot out the actual and predicted values as a scatterplot using ggplot2\n# Predicted values can be calculated using `predict(mod2, training)`\n# Use the ggplot() and geom_point() functions to plot\n# Use factor to specify color by quarter (fqtr)\n# To make the graph nicer looking, you can use the xlab() and ylab() function to label the axes\n# I recommend adding `geom_abline(slope=1)` to the plot as well to see where perfect accuracy would be\nggplot(data=training, aes(y=revtq,x=predict(mod2, training), color=factor(fqtr))) + \n  geom_point() + \n  geom_abline(slope=1) + \n  xlab(\"Predicted revenue\") + \n  ylab(\"Actual revenue\")\n\n#END","solution":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ndf[1:5,]\n# mod2 and training from Exercise 3 are also loaded\n\n# Load ggplot2 for plotting\nlibrary(ggplot2)\n\n# Plot out the actual and predicted values as a scatterplot using ggplot2\n# Predicted values can be calculated using `predict(mod2, training)`\n# Use the ggplot() and geom_point() functions to plot\n# Use factor to specify color by quarter (fqtr)\n# To make the graph nicer looking, you can use the xlab() and ylab() function to label the axes\n# I recommend adding `geom_abline(slope=1)` to the plot as well to see where perfect accuracy would be\nggplot(data=training, aes(y=revtq,x=predict(mod2, training), color=factor(fqtr))) + \n  geom_point() + \n  geom_abline(slope=1) + \n  xlab(\"Predicted revenue\") + \n  ylab(\"Actual revenue\")\n\n#END","sct":"# Template based on https://www.rdocumentation.org/packages/testwhat/versions/4.1.1\n# Check if something is explicitly typed\n\ntest_student_typed('y=revtq',not_typed_msg='Make sure to set `y=` to be 
`revtq`')\ntest_student_typed('x=predict(mod2, training)',not_typed_msg='Make sure to set `x=` to be `predict(mod2, training)`.')\ntest_student_typed('color=factor(fqtr)',not_typed_msg='You should put `factor()` in your expression for color, so as to get more distinct colors (as opposed to a color scale with `color=fqtr`).')\ntest_student_typed('geom_point()',not_typed_msg='Add `geom_point()` to your `ggplot()` call to make a scatter plot.')\n\n# test_student_typed('x <- 2', not_typed_msg='')\n\n# Check if function was used in input code\n# test_function('c',incorrect_msg='')  \n\n# Requires an object `x` to have the same value as the solution\n# test_object(\"x\",incorrect_msg = \"\",undefined_msg = \"\")  \n\n# Requires an onject with the same value of `x` in the solution\n# test_an_object(\"x\",undefined_msg=\"\")\n\n# Checks if output of student's code contains given evaluated expression\n# test_output_contains(\"x\",incorrect_msg = \"\")\n\n# Check if a vector of predefined objects are unchanged\n# test_predefined_objects(c('x','y'),incorrect_msg=\"Don't onverwrite the predefined variables\")\n\n# Checks for a regex pattern in trhe output\n# test_output_regex(pattern,fixed=F, times=1, incorrect_msg='')\n\n# Can check an arbitrary expression across both solution and student code\n#test_expression_output(\"typeof(company_name)\", incorrect_msg=\"Did you store textual data in `company_name`?\")\n\ntest_error()\nsuccess_msg(\"Awesome!\")\n\n# Other functions to note:\n#     - test_or(a,b) -- checks if either test a or test b pass\n#     - test_ggplot() -- can check if plots are correct\n#     - test_function() -- can also check included parameters\n#     - test_loop() -- checking for and while loops\n#     - test_library_function('package', not_called_msg='',incorrect_msg='')\n#     - test_if_else() -- checking if statements\n#     - test_expression_error() -- can check if functions are properly defined\n#     - test_operator('operator',), 
not_called_msg='',incorrect_msg='')\n#     - test_function_definition() -- rigorously check defined function\n#     - test_data_frame() -- check if dataframe [columns] are equivalent\n#     - test_function_result, test_expression_result"}

Exercise 5: Comparing the models with a line plot

Note: at the time of writing, DataCamp Light does not appear to be able to check plots for accuracy. Once you think you have the answer (or if you get stuck), please compare your plot with the solution manually.

{"language":"r","pre_exercise_code":"library(dplyr) # loads dplyr for data manipulation\ndf <- data.frame(tic=rep(\"WMT\",194),\n                 fyearq=c(1970,1970,1970,1970,1971,1971,1971,1971,1972,1972,1972,1972,1973,1973,1973,1973,1974,1974,1974,1974,1975,1975,1975,1975,1976,1976,1976,1976,1977,1977,1977,1977,1978,1978,1978,1978,1979,1979,1979,1979,1980,1980,1980,1980,1981,1981,1981,1981,1982,1982,1982,1982,1983,1983,1983,1983,1984,1984,1984,1984,1985,1985,1985,1985,1986,1986,1986,1986,1987,1987,1987,1987,1988,1988,1988,1988,1989,1989,1989,1989,1990,1990,1990,1990,1991,1991,1991,1991,1992,1992,1992,1992,1993,1993,1993,1993,1994,1994,1994,1994,1995,1995,1995,1995,1996,1996,1996,1996,1997,1997,1997,1997,1998,1998,1998,1998,1999,1999,1999,1999,2000,2000,2000,2000,2001,2001,2001,2001,2002,2002,2002,2002,2003,2003,2003,2003,2004,2004,2004,2004,2005,2005,2005,2005,2006,2006,2006,2006,2007,2007,2007,2007,2008,2008,2008,2008,2009,2009,2009,2009,2010,2010,2010,2010,2011,2011,2011,2011,2012,2012,2012,2012,2013,2013,2013,2013,2014,2014,2014,2014,2015,2015,2015,2015,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018),\n                 fqtr=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2),\n                 
revtq=c(7.267,9.939,11.47,15.61,12.758,18.207,19.205,27.844,21.011,29.918,32.683,41.277,32.737,37.655,42.098,55.071,46.355,56.177,59.462,74.215,64.65,78.852,86.108,110.721,93.971,115.186,121.875,147.775,122.655,152.381,172.637,230.783,167.274,204.085,221.982,306.957,238.569,291.685,300.747,417.175,314.575,372.598,401.842,554.184,434.251,533.653,634.476,842.617,665.229,788.705,827.767,1094.551,854.734,1098.942,1166.594,1546.639,1234.768,1508.533,1583.573,2073.983,1655.661,1934.358,2087.533,2773.936,2344.253,2770.276,2949.008,3845.557,3211.761,3732.237,4015.017,5000.238,4282.012,4845.133,4991.113,6530.734,5373.258,6046.41,6283.496,8107.484,6768.191,7543.508,7930.949,10358.898,9280.566,10339.898,10627.5,13638.797,11649.398,13028.398,13683.797,17122,13920.398,16236.5,16826.898,20360.5,17686.098,19942.297,20417.699,24447.797,20440,22723,22914,27550,22772,25587,25644,30856,25409,28386,28777,35386,29819,33521,33509,40785,35129,38913,40899,51868,43447,46588,46181,57079,48565,53187,53185,64735,52126,56781,55765,66905,57224,63231,63036,75190,65434,70516,69282,82900,70755,76697,75397,89252,79675,85430,84467,99078,86378,92827,91794,107289,94940,102342,98345,108747,94214,100876,99373,113622,99811,103726,101952,116360,104189,109366,110226,123169,113010,114065,113800,127776,114070,116830,115688,129706,114960,120125,119001,131565,114826,120229,117408,129667,115904,120319,118179,130936,117542,122968,123179,136267,122690,128028))\ndf <- df %>% mutate(revtq_lag=lag(revtq))\n\ntraining <- filter(df, fyearq < 2015)\ntesting <- filter(df, fyearq >= 2015)\n\nmod1 <- lm(revtq ~ revtq_lag, data=training)\nmod2 <- lm(revtq ~ revtq_lag:factor(fqtr), data=training)","sample":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ntesting[1:5,]\n# mod1, mod2, and testing are also 
loaded\n\n# Load ggplot2 for plotting\nlibrary(ggplot2)\n\n# Make a better index for dates\ntesting$quarter <- testing$fyearq + (testing$fqtr - 1) / 4\n\n# Plot out 3 lines:\n#     1) `revtq` vs `quarter`\n#     2) predicted using mod1 vs `quarter`\n#     3) predicted using mod2 vs `quarter`\n# Predicted values can be calculated using `predict(mod1, testing)` and `predict(mod2, testing)`\n# Use the ggplot() and geom_line() functions to plot\n# To make the graph nicer looking, you can use the xlab() and ylab() functions to label the axes\nggplot(data=testing) + \n  geom_line(aes(y=, x=, color=\"Actual\")) +\n  geom_line(aes(y=, x=, color=\"Simple model\")) +\n  geom_line(aes(y=, x=, color=\"Advanced model\")) + \n  xlab(\"Quarter\") + \n  ylab(\"Walmart's revenue\")\n#END","solution":"# The dataframe below contains historical revenue for Walmart since 1970\n# The file uses Compustat's nomenclature:\n#     - revenue is revtq\n#     - ticker is tic\n#     - fyearq is year\n#     - fqtr is quarter\n# It also contains the lag of revenue, revtq_lag\ntesting[1:5,]\n# mod1, mod2, and testing are also loaded\n\n# Load ggplot2 for plotting\nlibrary(ggplot2)\n\n# Make a better index for dates\ntesting$quarter <- testing$fyearq + (testing$fqtr - 1) / 4\n\n# Plot out 3 lines:\n#     1) `revtq` vs `quarter`\n#     2) predicted using mod1 vs `quarter`\n#     3) predicted using mod2 vs `quarter`\n# Predicted values can be calculated using `predict(mod1, testing)` and `predict(mod2, testing)`\n# Use the ggplot() and geom_line() functions to plot\n# To make the graph nicer looking, you can use the xlab() and ylab() functions to label the axes\nggplot(data=testing) + \n  geom_line(aes(y=revtq, x=quarter, color=\"Actual\")) +\n  geom_line(aes(y=predict(mod1, testing), x=quarter, color=\"Simple model\")) +\n  geom_line(aes(y=predict(mod2, testing), x=quarter, color=\"Advanced model\")) + \n  xlab(\"Quarter\") + \n  ylab(\"Walmart's revenue\")\n#END","sct":"# Template based on 
https://www.rdocumentation.org/packages/testwhat/versions/4.1.1\n# Check if something is explicitly typed\n\ntest_student_typed('y=revtq',not_typed_msg='Make sure to set `y=` to be `revtq` for a line.')\ntest_student_typed('y=predict(mod1, testing)',not_typed_msg='Make sure to set `y=` to be `predict(mod1, testing)` for a line.')\ntest_student_typed('y=predict(mod2, testing)',not_typed_msg='Make sure to set `y=` to be `predict(mod2, testing)` for a line.')\ntest_student_typed('x=quarter',not_typed_msg='Make sure to set `x=` to be `quarter` for all lines.')\ntest_student_typed('geom_line()',not_typed_msg='Use the provided `geom_line()` calls to make a line chart.')\n\n# test_student_typed('x <- 2', not_typed_msg='')\n\n# Check if function was used in input code\n# test_function('c',incorrect_msg='')  \n\n# Requires an object `x` to have the same value as the solution\n# test_object(\"x\",incorrect_msg = \"\",undefined_msg = \"\")  \n\n# Requires an object with the same value of `x` in the solution\n# test_an_object(\"x\",undefined_msg=\"\")\n\n# Checks if output of student's code contains given evaluated expression\n# test_output_contains(\"x\",incorrect_msg = \"\")\n\n# Check if a vector of predefined objects are unchanged\n# test_predefined_objects(c('x','y'),incorrect_msg=\"Don't overwrite the predefined variables\")\n\n# Checks for a regex pattern in the output\n# test_output_regex(pattern,fixed=F, times=1, incorrect_msg='')\n\n# Can check an arbitrary expression across both solution and student code\n#test_expression_output(\"typeof(company_name)\", incorrect_msg=\"Did you store textual data in `company_name`?\")\n\ntest_error()\nsuccess_msg(\"Awesome!\")\n\n# Other functions to note:\n#     - test_or(a,b) -- checks if either test a or test b pass\n#     - test_ggplot() -- can check if plots are correct\n#     - test_function() -- can also check included parameters\n#     - test_loop() -- checking for and while loops\n#     - 
test_library_function('package', not_called_msg='',incorrect_msg='')\n#     - test_if_else() -- checking if statements\n#     - test_expression_error() -- can check if functions are properly defined\n#     - test_operator('operator', not_called_msg='',incorrect_msg='')\n#     - test_function_definition() -- rigorously check defined function\n#     - test_data_frame() -- check if dataframe [columns] are equivalent\n#     - test_function_result, test_expression_result"}