from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA
# The two model classes are referenced below when reloading the persisted
# models; without this import those lines raise NameError.
from pyspark.ml.clustering import DistributedLDAModel, LocalLDAModel

# NOTE(review): `spark` and `temp_path` are not defined anywhere in this
# view -- presumably injected by the surrounding harness (e.g. doctest
# globals). Confirm before running this file standalone.

# Two-row training set: one dense and one sparse feature vector, both of size 2.
df = spark.createDataFrame(
    [
        [1, Vectors.dense([0.0, 1.0])],
        [2, SparseVector(2, {0: 1.0})],
    ],
    ["id", "features"],
)

# Fit a 2-topic LDA with a fixed seed using the "em" optimizer.
lda = LDA(k=2, seed=1, optimizer="em")
model = lda.fit(df)

# The bare expressions below only echo their value in an interactive or
# doctest session; when executed as a plain script the results are discarded.
model.isDistributed()
localModel = model.toLocal()
localModel.isDistributed()
model.vocabSize()
model.describeTopics().show()
model.topicsMatrix()

# Round-trip the (unfitted) estimator through disk.
lda_path = temp_path + "/lda"
lda.save(lda_path)
sameLDA = LDA.load(lda_path)

# Round-trip the fitted model; it is reloaded as a DistributedLDAModel.
distributed_model_path = temp_path + "/lda_distributed_model"
model.save(distributed_model_path)
sameModel = DistributedLDAModel.load(distributed_model_path)

# Round-trip the local copy obtained from toLocal(); reloaded as a LocalLDAModel.
local_model_path = temp_path + "/lda_local_model"
localModel.save(local_model_path)
sameLocalModel = LocalLDAModel.load(local_model_path)