#
# The function loads the "reuters" sample data, necessary for the XIV exercise session.
# Note that by default it requires internet connection.
# If you don't like it, download the reuters.txt file manually (its URL is the default
# value of the `file` argument below), put it in the working directory and pass the
# path of the local copy as `file`, e.g. load.data("reuters.txt").
# Note that you can download the compressed version (reuters.txt.gz). In this case
# you do not need to unpack the gz - for local files R handles unpacking automagically.
#
# Arguments:
#   file - URL or local path of the whitespace-separated table to read; it must have
#          a header row with (at least) the columns `Topic` and `Content`.
#
# Returns:
#   A data frame of 500 randomly chosen "crude" / "grain" articles with columns
#     Topic   - factor with the levels "crude" and "grain",
#     Content - character, the article text,
#     y       - numeric label: +1 for "grain", -1 for "crude".
#
# Side effects:
#   Calls set.seed(1) so that the same 500 rows are selected on every call; this
#   resets the state of the global random number generator.
#   Fails if fewer than 500 "crude"/"grain" rows are available.
#
load.data <- function(file = "https://courses.cs.ut.ee/2014/ml/uploads/Main/reuters.txt") {
  # The following is a slightly-processed version of the Reuters dataset. Namely:
  #   All articles with no TOPIC annotations are dropped
  #   The text of each article is converted to lowercase, whitespace is normalized to single-spaces.
  #   Only the first term from the TOPIC annotation list is retained (some articles have several topics assigned).
  # The resulting dataset is a list of pairs (Topic, News content)
  #
  # This is a fun dataset to play with, so I suggest you play with it beyond what we do in the exercise session.
  #
  reuters <- read.table(file, header = TRUE)

  # We leave only two topics here: Crude Oil and Grain-related news.
  # %in% (unlike ==) never yields NA, so a missing topic cannot sneak in as an all-NA row.
  reuters <- reuters[reuters$Topic %in% c("crude", "grain"), ]

  # In addition, to avoid waiting for too long, we limit our experiments to just
  # 500 elements (300 train + 200 test).
  set.seed(1)
  reuters <- reuters[sample(seq_len(nrow(reuters)), 500), ]

  # Fixup the columns
  reuters$Content <- as.character(reuters$Content)  # Older R versions load this as factor.
  reuters$Topic <- factor(reuters$Topic)            # Re-level the factor to have only two levels
  reuters$y <- 2 * (reuters$Topic == "grain") - 1   # TRUE/FALSE -> +1/-1
  reuters
}

#
# The Kernel Perceptron algorithm. Requires some fixing before use.
#
# Trains a perceptron in its dual (kernel) form.
# Note: the steps that were left as FIXME placeholders in the exercise skeleton
# (prediction and the two parameter updates) are implemented here; with the
# placeholders the loop could never terminate.
#
# The decision function is f(x_j) = sum_i alpha[i] * K[j, i] + b, i.e. alpha[i]
# already carries the sign of the label y[i] (it accumulates y[i] on every mistake
# made on point i, mirroring w := w + y[i]*x[i] of the usual perceptron).
#
# Arguments:
#   K        - square kernel (Gram) matrix of the training sample, K[i, j] = k(x_i, x_j).
#   y        - vector of labels, one per row of K, with values -1 / +1.
#   max_iter - maximal number of updates; the default (Inf) runs until every training
#              point is classified correctly, which never happens if the sample is
#              not separable under the given kernel.
#
# Returns:
#   A list with the elements `alpha` (numeric vector, one weight per training point)
#   and `b` (numeric scalar, the bias term).
#
kernel_perceptron <- function(K, y, max_iter = Inf) {
  alpha <- rep(0, nrow(K))
  b <- 0
  n_updates <- 0

  repeat {
    # Make predictions for all points in the training sample
    predictions <- as.vector(K %*% alpha) + b

    # Find misclassified instances (a prediction of exactly 0 counts as a mistake)
    misclassified <- which(sign(predictions) != y)

    # If no misclassified, we're done!
    if (length(misclassified) == 0) {
      break
    }

    # Give up if the caller limited the number of updates
    if (n_updates >= max_iter) {
      warning("kernel_perceptron: stopped after ", max_iter,
              " updates with misclassified training points remaining",
              call. = FALSE)
      break
    }

    # If something is misclassified, pick the first element
    i <- misclassified[1]

    # Update parameters (dual counterparts of w := w + y[i]*x[i] and b := b + y[i])
    alpha[i] <- alpha[i] + y[i]
    b <- b + y[i]
    n_updates <- n_updates + 1
  }

  # Return result
  list(alpha = alpha, b = b)
}

# Generates a random toy data set of 300 points arranged in three concentric rings.
#
# Each point gets an angle drawn uniformly from (0, 2*pi) and a distance from the
# origin drawn uniformly from (0, 1) for the first 100 points, (2, 3) for the next
# 100 and (4, 5) for the last 100.
#
# Returns:
#   A data frame with the columns x, y (coordinates) and c (factor with the levels
#   "1", "2", "3" identifying the ring of each point).
#   Uses the global random number generator; call set.seed() beforehand for
#   reproducible output.
circle.data <- function() {
  angles <- runif(300) * 2 * pi
  norms <- c(runif(100), runif(100) + 2, runif(100) + 4)
  ring <- factor(rep(1:3, each = 100))
  data.frame(x = sin(angles) * norms, y = cos(angles) * norms, c = ring)
}