compute the inverse document frequency

Solutions on MaxInterview for "compute the inverse document frequency" by the best coders in the world

showing results for - "compute the inverse document frequency"
Alex
07 Sep 2017
# Compute the Inverse Document Frequency (IDF)
#
# Walk-through of pyspark.ml.feature.IDF: fit on a tiny term-frequency
# DataFrame, transform it, override params at fit time, and round-trip both
# the estimator and the fitted model through save/load.
# A "# ..." comment directly below an expression shows the value that
# expression evaluates to in an interactive session.

import tempfile

from pyspark.ml.feature import IDF, IDFModel
from pyspark.ml.linalg import DenseVector
from pyspark.sql import SparkSession

# The example needs an active session and a scratch directory; create them
# here so the snippet runs on its own.
spark = SparkSession.builder.getOrCreate()
temp_path = tempfile.mkdtemp()

# Three "documents", each a 2-term term-frequency vector.
df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),
                            (DenseVector([0.0, 1.0]),),
                            (DenseVector([3.0, 0.2]),)], ["tf"])

# minDocFreq=3: term 0 is non-zero in only 2 documents, so it is filtered to
# 0; term 1 occurs in all 3, and log((3 + 1) / (3 + 1)) is 0 as well.
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df)
model.idf
# DenseVector([0.0, 0.0])
model.transform(df).head().idf
# DenseVector([0.0, 0.0])

# setParams mutates `idf` in place; `model` above keeps outputCol="idf".
idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs
# DenseVector([0.0, 0.0])

# A param map passed to fit() overrides the estimator's params for that fit
# only. With minDocFreq=1, term 0 gets log(4 / 3) ~= 0.2877.
params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
idf.fit(df, params).transform(df).head().vector
# DenseVector([0.2877, 0.0])

# Persist the estimator and check that its params survive the round trip.
idfPath = temp_path + "/idf"
idf.save(idfPath)
loadedIDF = IDF.load(idfPath)
loadedIDF.getMinDocFreq() == idf.getMinDocFreq()
# True

# Persist the fitted model and check that it transforms identically.
modelPath = temp_path + "/idf-model"
model.save(modelPath)
loadedModel = IDFModel.load(modelPath)
loadedModel.transform(df).head().idf == model.transform(df).head().idf
# True