Unsupervised learning
M. Benesty
2017-09-15
library(fastrtext)
# Make the example datasets available in the workspace
data("train_sentences")
data("test_sentences")

# Lower-case the training text and write it, one element per line, to a
# temporary file that is handed to fastText as the training corpus
texts <- tolower(train_sentences[, "text"])
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()
writeLines(texts, tmp_file_txt)

# Learn unsupervised skip-gram word embeddings; tmp_file_model is passed as
# the "-output" argument of the fastText command
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", 1
  )
)
##
Read 0M words
## Number of words: 2061
## Number of labels: 0
##
Progress: 100.0% words/sec/thread: 18594 lr: 0.000000 loss: 2.666634 eta: 0h0m
# Load the trained model back into R. Only the output prefix is passed:
# load_model() reports that it adds the ".bin" extension to the path itself.
model <- load_model(tmp_file_model)
## add .bin extension to the path
# Extract the vocabulary known to the model and show its first five entries
dict <- get_dictionary(model)
print(head(x = dict, n = 5L))
## [1] "the" "</s>" "of" "to" "and"
# Display the embedding vectors of two related words (one matrix row per word)
print(
  get_word_vectors(
    model,
    words = c("time", "timing")
  )
)
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.2646178 0.12198758 0.05387458 -0.001059301 0.05033693 0.009204047
## timing 0.2127367 0.03916844 0.01094351 0.066450834 0.01902333 0.014029122
## [,7] [,8] [,9] [,10] [,11]
## time 0.1006525 -0.05165429 -0.1867511 -0.09149687 -0.2140105
## timing 0.1037399 -0.02082206 -0.1845313 -0.10030819 -0.2252076
## [,12] [,13] [,14] [,15] [,16] [,17]
## time -0.002969536 0.04354328 0.02035887 0.1079958 0.13126923 -0.1849994
## timing 0.013844747 0.10126358 0.07640997 0.1328659 0.09676559 -0.1320701
## [,18] [,19] [,20] [,21] [,22]
## time -0.03653751 -0.06697740 0.11199731 -0.1125299 -0.08539391
## timing -0.00506349 -0.06391282 0.08757181 -0.1336615 -0.06963135
## [,23] [,24] [,25] [,26] [,27] [,28]
## time 0.09793303 0.06386217 -0.07443064 -0.09063132 0.3675116 -0.4145926
## timing 0.08923492 0.06158390 -0.05012904 -0.11666670 0.2799409 -0.4010045
## [,29] [,30] [,31] [,32] [,33] [,34]
## time 0.1636032 0.1763889 -0.1456650 0.009296339 -0.4786170 0.2489249
## timing 0.1788498 0.1866831 -0.1321961 -0.036398493 -0.5467163 0.3174041
## [,35] [,36] [,37] [,38] [,39] [,40]
## time 0.12506923 -0.2471142 -0.1608337 -0.15329435 0.01204692 -0.1272224
## timing 0.02231918 -0.1770893 -0.1355110 -0.09796838 0.02425462 -0.1356536
## [,41] [,42] [,43] [,44] [,45] [,46]
## time -0.1678761 -0.02987162 0.2413628 0.1440598 -0.1231114 -0.2393287
## timing -0.1476323 0.06143197 0.1945098 0.1576042 -0.1439670 -0.2252752
## [,47] [,48] [,49] [,50] [,51] [,52]
## time 0.02120129 0.091888636 -0.02908193 -0.1292347 0.05076646 0.07122216
## timing 0.01870585 0.004292499 -0.07009085 -0.0798136 0.05554843 0.07525542
## [,53] [,54] [,55] [,56] [,57] [,58]
## time 0.2357274 0.043514103 -0.1376838 -0.1783522 0.06955821 -0.2771731
## timing 0.3132353 -0.009632835 -0.1674953 -0.1352489 0.08505522 -0.2401329
## [,59] [,60] [,61] [,62] [,63] [,64]
## time -0.03030754 -0.10161730 -0.1649677 -0.2309162 -0.1236434 0.1572758
## timing -0.05090554 -0.08546982 -0.2037603 -0.2342789 -0.1370963 0.1491374
## [,65] [,66] [,67] [,68] [,69]
## time -0.02702036 -0.1185723 -0.0003240054 -0.3103080 -0.04891545
## timing -0.09487278 -0.1128573 -0.0590565316 -0.3185277 -0.08604440
## [,70] [,71] [,72] [,73] [,74] [,75]
## time 0.00150689 0.09819829 0.1082058 -0.09271113 -0.1486151 0.153983
## timing -0.03577995 0.10977877 0.1137564 -0.07488324 -0.1078962 0.125098
## [,76] [,77] [,78] [,79] [,80] [,81]
## time -0.08267978 0.06911367 0.3051794 -0.02734402 0.4275725 0.006040023
## timing -0.02125590 0.09258043 0.2793728 -0.09959116 0.4338518 0.053337410
## [,82] [,83] [,84] [,85] [,86] [,87]
## time 0.2066989 0.01661408 0.12355842 0.09618524 -0.2163441 0.2479467
## timing 0.2127510 0.01258069 0.09404702 0.07762441 -0.2389865 0.2282655
## [,88] [,89] [,90] [,91] [,92] [,93]
## time 0.05690623 -0.03176846 -0.1284508 0.2303838 0.1115090 -0.02541525
## timing 0.04056602 -0.03184763 -0.1091274 0.2715702 0.1042816 -0.07882912
## [,94] [,95] [,96] [,97] [,98]
## time 0.0708485 -0.2201677 -0.04842542 -0.05653475 -0.06495549
## timing 0.0992337 -0.1848852 0.02896625 -0.02126822 -0.07211553
## [,99] [,100]
## time -0.13325213 -0.1850181
## timing -0.09662861 -0.1246495
# Distance between the embeddings of two related words
get_word_distance(model, w1 = "time", w2 = "timing")
## [,1]
## [1,] 0.03187904
# Clean up: delete the temporary files, then free the memory held by the model.
# tmp_file_model is only the output *prefix* given to fastText; the files
# actually written are "<prefix>.bin" (the model) and "<prefix>.vec" (the word
# vectors), so unlinking the bare prefix would leave both on disk.
unlink(tmp_file_txt)
unlink(paste0(tmp_file_model, c(".bin", ".vec")))
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1194396 63.8 2164898 115.7 2164898 115.7
## Vcells 2537114 19.4 3942845 30.1 3942844 30.1