本文共 1200 字,大约阅读时间需要 4 分钟。
// Apriori frequent-itemset mining on Spark.
// Written for spark-shell, where the SparkContext `sc` is already defined.

// Toy transaction database: each inner array is one transaction (a basket of item ids).
val mydata = Array(Array(1, 3, 4, 5), Array(2, 3, 5), Array(1, 2, 3, 4, 5), Array(2, 3, 4, 5))
val pamydata = sc.parallelize(mydata)

// C1: candidate 1-itemsets, one singleton set per distinct item.
val C1 = pamydata.flatMap(_.toSet).distinct().collect().map(Set(_))

// Transactions as sets, broadcast so support can be counted inside RDD operations.
val D = mydata.map(_.toSet)
val D_bc = sc.broadcast(D)

// Number of transactions; the denominator of every support value.
val length = mydata.length

// Minimum support threshold, as a fraction of all transactions.
val limit = 0.70

/** Computes the support of candidate itemset `a` and keeps it only if it is frequent.
  *
  * @param a      candidate itemset
  * @param B      all transactions, each as a set of items
  * @param length total number of transactions (denominator of the support)
  * @param limit  minimum support, as a fraction in [0, 1]
  * @return `Some((a, support))` when `support >= limit`, otherwise `None`
  */
def f1(a: Set[Int], B: Array[Set[Int]], length: Int, limit: Double): Option[(Set[Int], Double)] = {
  // Count the supporting transactions once instead of filtering B twice.
  val support = B.count(b => a.subsetOf(b)) / length.toDouble
  if (support >= limit) Some((a, support)) else None
}

/** Evaluates the candidates on the cluster and returns the frequent ones with their support. */
def frequentOf(candidates: Array[Set[Int]]): Array[(Set[Int], Double)] =
  sc.parallelize(candidates)
    // Use the real transaction count (`length`) rather than a hard-coded 4.
    .flatMap(c => f1(c, D_bc.value, length, limit).toList)
    .collect()

// suppdata accumulates (itemset, support) for every frequent itemset of every size.
var suppdata = frequentOf(C1)

// L(i) holds the frequent itemsets of size i + 1.
val L1 = suppdata.map(_._1)
var L: Array[Array[Set[Int]]] = Array(L1)

var k = 2
while (L(k - 2).nonEmpty) {
  // Join step: two frequent (k-1)-itemsets are merged into a k-candidate when they
  // share their first k-2 items. The items are sorted first so that the "prefix" is
  // well defined; Set iteration order is not guaranteed by the language.
  val previous = L(k - 2).map(_.toSeq.sorted)
  val CK = for {
    (var1, index) <- previous.zipWithIndex
    var2 <- previous.drop(index + 1)
    if var1.take(k - 2) == var2.take(k - 2)
  } yield (var1 ++ var2).toSet

  val suppdata_temp = frequentOf(CK)
  // Concatenate (++); appending with :+ would nest the whole array as one element.
  suppdata = suppdata ++ suppdata_temp
  L = L :+ suppdata_temp.map(_._1)
  k += 1
}

// The loop always ends by appending an empty level; drop it.
L = L.filter(_.nonEmpty)

// Echo the results in the shell: frequent itemsets per size, then all (itemset, support) pairs.
L
suppdata

// Only frequent-itemset mining is implemented here; code for discovering
// association rules can be written along the same lines.
转载地址:http://imnrj.baihongyu.com/