import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Runs a word count over every file in a directory and collects the results into a list.
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)
    var resultList = List[(String, Int)]() // accumulates the per-file word counts
    // List every file under the input directory on HDFS
    val files = FileSystem
      .get(new java.net.URI("hdfs://cluster1"), new org.apache.hadoop.conf.Configuration())
      .listStatus(new Path(args(0)))
    for (f <- files) {
      println("YTQ-FilePath => " + f.getPath.toString)
      // Word-count a single file and append its counts to the driver-side list
      resultList = resultList ::: sc.textFile(f.getPath.toString)
        .flatMap(_.split("\t"))
        .map((_, 1))
        .reduceByKey(_ + _)
        .collect
        .toList
    }
    // Reduce the combined per-file results once more and write the final counts
    sc.parallelize(resultList).reduceByKey(_ + _).saveAsTextFile(args(1))
    sc.stop()
}
}
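
/**
 * Alternative sketch, not part of the original program: sc.textFile accepts a
 * directory (or a glob), so Spark can read every file under args(0) in a single
 * job. This avoids collecting each file's counts to the driver via resultList.
 * The object name WordCountDirect is hypothetical; args(0) and args(1) are the
 * same input directory and output path as above.
 */
object WordCountDirect {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCountDirect")
    val sc = new SparkContext(conf)
    // One RDD over all files under args(0); reduceByKey yields the global totals
    sc.textFile(args(0))
      .flatMap(_.split("\t"))
      .map((_, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile(args(1))
    sc.stop()
  }
}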