Unsupervised learning

M. Benesty

2017-09-15

 library(fastrtext)

    # load the two example sentence datasets
    data("train_sentences", "test_sentences")

    # lower-case the training text and write it, one entry per line, to a
    # temporary file that is used as the training input
    texts <- tolower(train_sentences[, "text"])
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(texts, tmp_file_txt)

    # run the "skipgram" command with the text file as input and
    # tmp_file_model as the output path
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", "1"
      )
    )
## 
Read 0M words
## Number of words:  2061
## Number of labels: 0
## 
Progress: 100.0%  words/sec/thread: 18594  lr: 0.000000  loss: 2.666634  eta: 0h0m
    # load the model written by the training step; the path is given without
    # extension (see the message printed just below)
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # get the dictionary from the loaded model and print its first 5 entries
    dict <- get_dictionary(model)
    print(head(dict, 5))
## [1] "the"  "</s>" "of"   "to"   "and"
  # print the vectors of the words "time" and "timing"
  # (the printed matrix below has one row per requested word)
  print(get_word_vectors(model, c("time", "timing")))
##             [,1]       [,2]       [,3]         [,4]       [,5]        [,6]
## time   0.2646178 0.12198758 0.05387458 -0.001059301 0.05033693 0.009204047
## timing 0.2127367 0.03916844 0.01094351  0.066450834 0.01902333 0.014029122
##             [,7]        [,8]       [,9]       [,10]      [,11]
## time   0.1006525 -0.05165429 -0.1867511 -0.09149687 -0.2140105
## timing 0.1037399 -0.02082206 -0.1845313 -0.10030819 -0.2252076
##               [,12]      [,13]      [,14]     [,15]      [,16]      [,17]
## time   -0.002969536 0.04354328 0.02035887 0.1079958 0.13126923 -0.1849994
## timing  0.013844747 0.10126358 0.07640997 0.1328659 0.09676559 -0.1320701
##              [,18]       [,19]      [,20]      [,21]       [,22]
## time   -0.03653751 -0.06697740 0.11199731 -0.1125299 -0.08539391
## timing -0.00506349 -0.06391282 0.08757181 -0.1336615 -0.06963135
##             [,23]      [,24]       [,25]       [,26]     [,27]      [,28]
## time   0.09793303 0.06386217 -0.07443064 -0.09063132 0.3675116 -0.4145926
## timing 0.08923492 0.06158390 -0.05012904 -0.11666670 0.2799409 -0.4010045
##            [,29]     [,30]      [,31]        [,32]      [,33]     [,34]
## time   0.1636032 0.1763889 -0.1456650  0.009296339 -0.4786170 0.2489249
## timing 0.1788498 0.1866831 -0.1321961 -0.036398493 -0.5467163 0.3174041
##             [,35]      [,36]      [,37]       [,38]      [,39]      [,40]
## time   0.12506923 -0.2471142 -0.1608337 -0.15329435 0.01204692 -0.1272224
## timing 0.02231918 -0.1770893 -0.1355110 -0.09796838 0.02425462 -0.1356536
##             [,41]       [,42]     [,43]     [,44]      [,45]      [,46]
## time   -0.1678761 -0.02987162 0.2413628 0.1440598 -0.1231114 -0.2393287
## timing -0.1476323  0.06143197 0.1945098 0.1576042 -0.1439670 -0.2252752
##             [,47]       [,48]       [,49]      [,50]      [,51]      [,52]
## time   0.02120129 0.091888636 -0.02908193 -0.1292347 0.05076646 0.07122216
## timing 0.01870585 0.004292499 -0.07009085 -0.0798136 0.05554843 0.07525542
##            [,53]        [,54]      [,55]      [,56]      [,57]      [,58]
## time   0.2357274  0.043514103 -0.1376838 -0.1783522 0.06955821 -0.2771731
## timing 0.3132353 -0.009632835 -0.1674953 -0.1352489 0.08505522 -0.2401329
##              [,59]       [,60]      [,61]      [,62]      [,63]     [,64]
## time   -0.03030754 -0.10161730 -0.1649677 -0.2309162 -0.1236434 0.1572758
## timing -0.05090554 -0.08546982 -0.2037603 -0.2342789 -0.1370963 0.1491374
##              [,65]      [,66]         [,67]      [,68]       [,69]
## time   -0.02702036 -0.1185723 -0.0003240054 -0.3103080 -0.04891545
## timing -0.09487278 -0.1128573 -0.0590565316 -0.3185277 -0.08604440
##              [,70]      [,71]     [,72]       [,73]      [,74]    [,75]
## time    0.00150689 0.09819829 0.1082058 -0.09271113 -0.1486151 0.153983
## timing -0.03577995 0.10977877 0.1137564 -0.07488324 -0.1078962 0.125098
##              [,76]      [,77]     [,78]       [,79]     [,80]       [,81]
## time   -0.08267978 0.06911367 0.3051794 -0.02734402 0.4275725 0.006040023
## timing -0.02125590 0.09258043 0.2793728 -0.09959116 0.4338518 0.053337410
##            [,82]      [,83]      [,84]      [,85]      [,86]     [,87]
## time   0.2066989 0.01661408 0.12355842 0.09618524 -0.2163441 0.2479467
## timing 0.2127510 0.01258069 0.09404702 0.07762441 -0.2389865 0.2282655
##             [,88]       [,89]      [,90]     [,91]     [,92]       [,93]
## time   0.05690623 -0.03176846 -0.1284508 0.2303838 0.1115090 -0.02541525
## timing 0.04056602 -0.03184763 -0.1091274 0.2715702 0.1042816 -0.07882912
##            [,94]      [,95]       [,96]       [,97]       [,98]
## time   0.0708485 -0.2201677 -0.04842542 -0.05653475 -0.06495549
## timing 0.0992337 -0.1848852  0.02896625 -0.02126822 -0.07211553
##              [,99]     [,100]
## time   -0.13325213 -0.1850181
## timing -0.09662861 -0.1246495
  # compute the distance between the words "time" and "timing";
  # the result is auto-printed at top level (a 1x1 matrix in the output below)
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.03187904
  # remove the temporary files and free memory
  unlink(tmp_file_txt)
  # tempfile() only returns a name: no file is ever created at the bare
  # tmp_file_model path. The training command writes its results to that path
  # plus an extension (the model as ".bin", the word vectors as ".vec"), so
  # those are the files that have to be deleted.
  unlink(paste0(tmp_file_model, c(".bin", ".vec")))
  # drop the reference to the model, then trigger a garbage collection
  rm(model)
  gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1194396 63.8    2164898 115.7  2164898 115.7
## Vcells 2537114 19.4    3942845  30.1  3942844  30.1