## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Package version: 1.5.2
## Parallel computing: 2 of 2 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View

Explore Data

# Read the movie data set straight from data.world; keep strings as character.
df <- read.csv(
  file = "https://query.data.world/s/rr46ndg7fyne54q7oonmvzxbaxg3zn",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Lower-case the column names so they can be referenced uniformly.
names(df) <- tolower(names(df))
# Keep only the three columns used in the rest of the analysis.
dat <- df[, c("title", "genre", "plot")]
# Render the first rows as a table.
pander(head(dat))
Table continues below
title genre
Code Name: K.O.Z. Crime, Mystery
Saving Christmas Comedy, Family
Superbabies: Baby Geniuses 2 Comedy, Family, Sci-Fi
Daniel der Zauberer Comedy, Crime, Fantasy
Manos: The Hands of Fate Horror
Pledge This! Comedy
plot
A look at the 17-25 December 2013 corruption scandal in Turkey, from the viewpoint of the Erdogan government.
Kirk is enjoying the annual Christmas party extravaganza thrown by his sister until he realizes he needs to help out Christian, his brother-in-law, who has a bad case of the bah-humbugs. …
A group of smart-talking toddlers find themselves at the center of a media mogul’s experiment to crack the code to baby talk. The toddlers must race against time for the sake of babies everywhere.
Evil assassins want to kill Daniel Kublbock, the third runner up for the German Idols.
A family gets lost on the road and stumbles upon a hidden, underground, devil-worshiping cult led by the fearsome Master and his servant Torgo.
At South Beach University, a beautiful sorority president takes in a group of unconventional freshman girls seeking acceptance into her house.
# Show the first few genre strings that contain "romance", ignoring case.
pander(head(grep("romance", dat$genre, ignore.case = TRUE, value = TRUE)))

Horror, Romance, Thriller, Comedy, Romance, Sport, Comedy, Romance, Comedy, Musical, Romance, Drama, Romance, Thriller and Drama, Music, Romance

# Logical mask: TRUE for every movie whose genre string mentions "romance"
# (case-insensitive). TRUE is spelled out because the shorthand `T` is an
# ordinary variable that can be reassigned.
romance <- grepl("romance", dat$genre, ignore.case = TRUE)
# Number of matching movies.
sum(romance)
[1] 927
# Keep only the rows flagged by the `romance` mask, with title, genre and plot.
dat.romance <- dat[romance, c("title", "genre", "plot")]
# Render the first ten rows as a table.
pander(head(dat.romance, 10))
Table continues below
  title genre
9 Birdemic: Shock and Terror Horror, Romance, Thriller
10 Dream.net Comedy, Romance, Sport
14 The Hottie & the Nottie Comedy, Romance
16 From Justin to Kelly Comedy, Musical, Romance
23 Ben & Arthur Drama, Romance, Thriller
32 Glitter Drama, Music, Romance
36 Space Mutiny Action, Adventure, Romance
51 Gigli Comedy, Crime, Romance
80 A Story About Love Romance
97 The Bat People Horror, Romance
  plot
9 A horde of mutated birds descends upon the quiet town of Half Moon Bay, California. With the death toll rising, Two citizens manage to fight back, but will they survive Birdemic?
10 Regina, the once popular girl has to make new friends at her new, conservative school. Problems arrive when she becomes enemies with Lívia, the school’s queen bee, and falls in love with …
14 A woman agrees to go on a date with a man only if he finds a suitor for her unattractive best friend.
16 A waitress from Texas and a college student from Pennsylvania meet during spring break in Fort Lauderdale, Florida and come together through their shared love of singing.
23 A pair of recently married gay men are threatened by one of the partners’ brother, a religious fanatic who plots to murder them after being ostracized by his church.
32 A young singer dates a disc jockey who helps her get into the music business, but their relationship become complicated as she ascends to super stardom.
36 A pilot is the only hope to stop the mutiny of a spacecraft by its security crew, who plot to sell the crew of the ship into slavery.
51 The violent story about how a criminal lesbian, a tough-guy hit-man with a heart of gold, and a mentally challenged man came to be best friends through a hostage.
80 Two young people stand on a street corner in a run-down part of New York, kissing. Despite the lawlessness of the district they are left unmolested. A short distance away walk Maria and …
97 After being bitten by a bat in a cave, a doctor undergoes an accelerating transformation into a man-bat, which ruins his vacation and causes considerable distress for his wife.
# Build a corpus with one document per movie: the title is the document id and
# the plot is the document text.
corp.romance <- corpus(
  x = dat.romance,
  docid_field = "title",
  text_field = "plot"
)
# Print the corpus overview.
corp.romance
Corpus consisting of 927 documents and 1 docvar.
# Peek at the first five documents of the corpus.
corp.romance[seq_len(5)]
                                                                                                                                                                      Birdemic: Shock and Terror 
            "A horde of mutated birds descends upon the quiet town of Half Moon Bay, California. With the death toll rising, Two citizens manage to fight back, but will they survive Birdemic?" 
                                                                                                                                                                                       Dream.net 
"Regina, the once popular girl has to make new friends at her new, conservative school. Problems arrive when she becomes enemies with Lívia, the school's queen bee, and falls in love with ..." 
                                                                                                                                                                         The Hottie & the Nottie 
                                                                                         "A woman agrees to go on a date with a man only if he finds a suitor for her unattractive best friend." 
                                                                                                                                                                            From Justin to Kelly 
                    "A waitress from Texas and a college student from Pennsylvania meet during spring break in Fort Lauderdale, Florida and come together through their shared love of singing." 
                                                                                                                                                                                    Ben & Arthur 
                         "A pair of recently married gay men are threatened by one of the partners' brother, a religious fanatic who plots to murder them after being ostracized by his church." 
# Per-document summary of the corpus; show only the first ten rows.
summary(corp.romance)[seq_len(10), ]
                         Text Types Tokens Sentences                      genre
1  Birdemic: Shock and Terror    32     36         2  Horror, Romance, Thriller
2                   Dream.net    31     40         2     Comedy, Romance, Sport
3     The Hottie & the Nottie    21     23         1            Comedy, Romance
4        From Justin to Kelly    27     29         1   Comedy, Musical, Romance
5                Ben & Arthur    30     32         1   Drama, Romance, Thriller
6                     Glitter    28     28         1      Drama, Music, Romance
7                Space Mutiny    24     30         1 Action, Adventure, Romance
8                       Gigli    27     32         1     Comedy, Crime, Romance
9          A Story About Love    32     39         3                    Romance
10             The Bat People    27     32         1            Horror, Romance
# pre-processing steps:

# trim the plot summaries sentence by sentence, keeping only sentences that
# contain at least 1 token (min_ntoken = 1)
corp.romance <- corpus_trim(corp.romance, what = "sentences", min_ntoken = 1)
corp.romance
Corpus consisting of 927 documents and 1 docvar.
# Split every plot into word tokens, discarding punctuation marks.
tokens.romance <- tokens(x = corp.romance, what = "word", remove_punct = TRUE)
# Inspect the tokens of the first few documents.
tokens.romance %>% head()
tokens from 6 documents.
Birdemic: Shock and Terror :
 [1] "A"          "horde"      "of"         "mutated"    "birds"     
 [6] "descends"   "upon"       "the"        "quiet"      "town"      
[11] "of"         "Half"       "Moon"       "Bay"        "California"
[16] "With"       "the"        "death"      "toll"       "rising"    
[21] "Two"        "citizens"   "manage"     "to"         "fight"     
[26] "back"       "but"        "will"       "they"       "survive"   
[31] "Birdemic"  

Dream.net :
 [1] "Regina"       "the"          "once"         "popular"      "girl"        
 [6] "has"          "to"           "make"         "new"          "friends"     
[11] "at"           "her"          "new"          "conservative" "school"      
[16] "Problems"     "arrive"       "when"         "she"          "becomes"     
[21] "enemies"      "with"         "Lívia"        "the"          "school's"    
[26] "queen"        "bee"          "and"          "falls"        "in"          
[31] "love"         "with"        

The Hottie & the Nottie :
 [1] "A"            "woman"        "agrees"       "to"           "go"          
 [6] "on"           "a"            "date"         "with"         "a"           
[11] "man"          "only"         "if"           "he"           "finds"       
[16] "a"            "suitor"       "for"          "her"          "unattractive"
[21] "best"         "friend"      

From Justin to Kelly :
 [1] "A"            "waitress"     "from"         "Texas"        "and"         
 [6] "a"            "college"      "student"      "from"         "Pennsylvania"
[11] "meet"         "during"       "spring"       "break"        "in"          
[16] "Fort"         "Lauderdale"   "Florida"      "and"          "come"        
[21] "together"     "through"      "their"        "shared"       "love"        
[26] "of"           "singing"     

Ben & Arthur :
 [1] "A"          "pair"       "of"         "recently"   "married"   
 [6] "gay"        "men"        "are"        "threatened" "by"        
[11] "one"        "of"         "the"        "partners"   "brother"   
[16] "a"          "religious"  "fanatic"    "who"        "plots"     
[21] "to"         "murder"     "them"       "after"      "being"     
[26] "ostracized" "by"         "his"        "church"    

Glitter :
 [1] "A"            "young"        "singer"       "dates"        "a"           
 [6] "disc"         "jockey"       "who"          "helps"        "her"         
[11] "get"          "into"         "the"          "music"        "business"    
[16] "but"          "their"        "relationship" "become"       "complicated" 
[21] "as"           "she"          "ascends"      "to"           "super"       
[26] "stardom"     
# Lower-case all tokens; keep_acronyms = TRUE asks quanteda to leave
# acronyms in upper case.
tokens.romance <- tokens_tolower(x = tokens.romance, keep_acronyms = TRUE)
# Inspect the tokens of the first few documents.
tokens.romance %>% head()
tokens from 6 documents.
Birdemic: Shock and Terror :
 [1] "a"          "horde"      "of"         "mutated"    "birds"     
 [6] "descends"   "upon"       "the"        "quiet"      "town"      
[11] "of"         "half"       "moon"       "bay"        "california"
[16] "with"       "the"        "death"      "toll"       "rising"    
[21] "two"        "citizens"   "manage"     "to"         "fight"     
[26] "back"       "but"        "will"       "they"       "survive"   
[31] "birdemic"  

Dream.net :
 [1] "regina"       "the"          "once"         "popular"      "girl"        
 [6] "has"          "to"           "make"         "new"          "friends"     
[11] "at"           "her"          "new"          "conservative" "school"      
[16] "problems"     "arrive"       "when"         "she"          "becomes"     
[21] "enemies"      "with"         "lívia"        "the"          "school's"    
[26] "queen"        "bee"          "and"          "falls"        "in"          
[31] "love"         "with"        

The Hottie & the Nottie :
 [1] "a"            "woman"        "agrees"       "to"           "go"          
 [6] "on"           "a"            "date"         "with"         "a"           
[11] "man"          "only"         "if"           "he"           "finds"       
[16] "a"            "suitor"       "for"          "her"          "unattractive"
[21] "best"         "friend"      

From Justin to Kelly :
 [1] "a"            "waitress"     "from"         "texas"        "and"         
 [6] "a"            "college"      "student"      "from"         "pennsylvania"
[11] "meet"         "during"       "spring"       "break"        "in"          
[16] "fort"         "lauderdale"   "florida"      "and"          "come"        
[21] "together"     "through"      "their"        "shared"       "love"        
[26] "of"           "singing"     

Ben & Arthur :
 [1] "a"          "pair"       "of"         "recently"   "married"   
 [6] "gay"        "men"        "are"        "threatened" "by"        
[11] "one"        "of"         "the"        "partners"   "brother"   
[16] "a"          "religious"  "fanatic"    "who"        "plots"     
[21] "to"         "murder"     "them"       "after"      "being"     
[26] "ostracized" "by"         "his"        "church"    

Glitter :
 [1] "a"            "young"        "singer"       "dates"        "a"           
 [6] "disc"         "jockey"       "who"          "helps"        "her"         
[11] "get"          "into"         "the"          "music"        "business"    
[16] "but"          "their"        "relationship" "become"       "complicated" 
[21] "as"           "she"          "ascends"      "to"           "super"       
[26] "stardom"     
# Remove English stop words plus the stray "nbsp" token (presumably a leftover
# HTML entity -- verify against the raw data). padding = FALSE drops the
# removed tokens outright instead of leaving empty placeholders.
tokens.romance <- tokens_remove(tokens.romance, c(stopwords("english"), "nbsp"),
    padding = FALSE)
# Inspect the tokens of the first few documents.
head(tokens.romance)
tokens from 6 documents.
Birdemic: Shock and Terror :
 [1] "horde"      "mutated"    "birds"      "descends"   "upon"      
 [6] "quiet"      "town"       "half"       "moon"       "bay"       
[11] "california" "death"      "toll"       "rising"     "two"       
[16] "citizens"   "manage"     "fight"      "back"       "survive"   
[21] "birdemic"  

Dream.net :
 [1] "regina"       "popular"      "girl"         "make"         "new"         
 [6] "friends"      "new"          "conservative" "school"       "problems"    
[11] "arrive"       "becomes"      "enemies"      "lívia"        "school's"    
[16] "queen"        "bee"          "falls"        "love"        

The Hottie & the Nottie :
 [1] "woman"        "agrees"       "go"           "date"         "man"         
 [6] "finds"        "suitor"       "unattractive" "best"         "friend"      

From Justin to Kelly :
 [1] "waitress"     "texas"        "college"      "student"      "pennsylvania"
 [6] "meet"         "spring"       "break"        "fort"         "lauderdale"  
[11] "florida"      "come"         "together"     "shared"       "love"        
[16] "singing"     

Ben & Arthur :
 [1] "pair"       "recently"   "married"    "gay"        "men"       
 [6] "threatened" "one"        "partners"   "brother"    "religious" 
[11] "fanatic"    "plots"      "murder"     "ostracized" "church"    

Glitter :
 [1] "young"        "singer"       "dates"        "disc"         "jockey"      
 [6] "helps"        "get"          "music"        "business"     "relationship"
[11] "become"       "complicated"  "ascends"      "super"        "stardom"     
# Reduce every token to its word stem.
tokens.romance <- tokens_wordstem(x = tokens.romance)
# Inspect the stemmed tokens of the first few documents.
tokens.romance %>% head()
tokens from 6 documents.
Birdemic: Shock and Terror :
 [1] "hord"       "mutat"      "bird"       "descend"    "upon"      
 [6] "quiet"      "town"       "half"       "moon"       "bay"       
[11] "california" "death"      "toll"       "rise"       "two"       
[16] "citizen"    "manag"      "fight"      "back"       "surviv"    
[21] "birdem"    

Dream.net :
 [1] "regina"  "popular" "girl"    "make"    "new"     "friend"  "new"    
 [8] "conserv" "school"  "problem" "arriv"   "becom"   "enemi"   "lívia"  
[15] "school"  "queen"   "bee"     "fall"    "love"   

The Hottie & the Nottie :
 [1] "woman"     "agre"      "go"        "date"      "man"       "find"     
 [7] "suitor"    "unattract" "best"      "friend"   

From Justin to Kelly :
 [1] "waitress"     "texa"         "colleg"       "student"      "pennsylvania"
 [6] "meet"         "spring"       "break"        "fort"         "lauderdal"   
[11] "florida"      "come"         "togeth"       "share"        "love"        
[16] "sing"        

Ben & Arthur :
 [1] "pair"     "recent"   "marri"    "gay"      "men"      "threaten"
 [7] "one"      "partner"  "brother"  "religi"   "fanat"    "plot"    
[13] "murder"   "ostrac"   "church"  

Glitter :
 [1] "young"        "singer"       "date"         "disc"         "jockey"      
 [6] "help"         "get"          "music"        "busi"         "relationship"
[11] "becom"        "complic"      "ascend"       "super"        "stardom"     
# Find frequently co-occurring words (typically compound words): form all
# adjacent word pairs and count them in a document-feature matrix.
ngram.romance <- dfm(tokens_ngrams(tokens.romance, n = 2))
# Ten most frequent bigrams.
textstat_frequency(ngram.romance, n = 10)
       feature frequency rank docfreq group
1    fall_love        57    1      57   all
2     new_york        37    2      36   all
3    young_man        32    3      32   all
4  high_school        28    4      27   all
5  young_woman        28    4      28   all
6  best_friend        27    6      26   all
7    world_war        14    7      14   all
8    york_citi        12    8      12   all
9    true_love        10    9      10   all
10      war_ii         9   10       9   all
# Same as the bigram count, but for sequences of three adjacent tokens.
ngram.romance3 <- dfm(tokens_ngrams(tokens.romance, n = 3))
# Ten most frequent trigrams.
textstat_frequency(ngram.romance3, n = 10)
             feature frequency rank docfreq group
1      new_york_citi        12    1      12   all
2       world_war_ii         9    2       9   all
3    two_best_friend         5    3       5   all
4    woman_fall_love         4    4       4   all
5    fall_love_woman         4    4       4   all
6    life_chang_meet         3    6       3   all
7   chang_live_forev         3    6       3   all
8  experi_chang_live         3    6       3   all
9   young_woman_find         3    6       3   all
10    meet_fall_love         3    6       3   all
# Most frequent single-word features across all romance plots.
# TRUE is spelled out because the shorthand `T` can be reassigned.
# NOTE(review): tokens.romance was already stemmed by tokens_wordstem(), so
# stem = TRUE stems the tokens a second time -- confirm that this is intended.
tokens.romance %>%
  dfm(stem = TRUE) %>%
  topfeatures()
  love  young  woman    man   life   fall   find friend    two    new 
   182    146    144    138    112     93     93     91     90     88 

Word