@@ -4,7 +4,6 @@ import org.apache.spark.SparkConf
4
4
import org .apache .spark .SparkContext
5
5
import org .apache .spark .SparkContext .*
6
6
import org .apache .log4j .{Logger , Level }
7
-
8
7
import org .apache .spark .rdd .RDD
9
8
import scala .util .Properties .isWin
10
9
@@ -17,10 +16,7 @@ case class WikipediaArticle(title: String, text: String):
17
16
/** Reports whether `lang` is one of the tokens obtained by splitting `text` on single spaces.
  *
  * @param lang the language name to look for (compared for exact equality with each token)
  * @return `true` if at least one space-separated token of `text` equals `lang`
  */
def mentionsLanguage(lang: String): Boolean =
  text.split(' ').exists(_ == lang)
18
17
19
18
object WikipediaRanking extends WikipediaRankingInterface :
20
- // Reduce Spark logging verbosity
21
- Logger
22
- .getLogger(" org.apache.spark" )
23
- .setLevel(Level .ERROR )
19
+ Logger .getLogger(" org.apache.spark" ).setLevel(Level .ERROR ) // Reduce Spark log verbosity
24
20
25
21
if isWin then
26
22
System .setProperty(
@@ -46,21 +42,17 @@ object WikipediaRanking extends WikipediaRankingInterface:
46
42
" Groovy"
47
43
)
48
44
49
// Spark configuration. The master URL must be exactly "local[4]": surrounding whitespace is
// not a valid master URL and makes SparkContext construction fail.
val conf: SparkConf =
  new SparkConf()
    .setMaster("local[4]")
    .setAppName(" MyApp")

// The single SparkContext shared by every computation in this object.
val sc: SparkContext = new SparkContext(conf)

// Hint: use a combination of `sc.parallelize`,
// `WikipediaData.lines` and `WikipediaData.parse`
// The second argument to `parallelize` is the number of slices the collection is cut into.
// NOTE(review): assumes `WikipediaData.lines` is an in-memory collection of raw lines and
// `WikipediaData.parse` turns one line into a `WikipediaArticle` - confirm in WikipediaData.
val wikiRdd: RDD[WikipediaArticle] =
  sc.parallelize(WikipediaData.lines.map(WikipediaData.parse), 4)
60
52
61
- /** Returns the number of articles on which the language `lang` occurs. Hint1:
62
- * consider using method `aggregate` on RDD[T]. Hint2: consider using method
63
- * `mentionsLanguage` on `WikipediaArticle`
53
+ /** Returns the number of articles on which the language `lang` occurs. Hint1: consider
54
+ * using method `aggregate` on RDD[T]. Hint2: consider using method `mentionsLanguage`
55
+ * on `WikipediaArticle`
64
56
*/
65
57
def occurrencesOfLang (lang : String , rdd : RDD [WikipediaArticle ]): Int = // TODO
66
58
val fun = (count : Int , article : WikipediaArticle ) =>
@@ -79,22 +71,18 @@ object WikipediaRanking extends WikipediaRankingInterface:
79
71
langs : List [String ],
80
72
rdd : RDD [WikipediaArticle ]
81
73
): List [(String , Int )] = // TODO
82
- val langRanks =
83
- for lang <- langs
84
- yield (lang, occurrencesOfLang(lang, rdd))
85
-
74
+ val langRanks = for lang <- langs yield (lang, occurrencesOfLang(lang, rdd))
86
75
langRanks.sortBy(- _._2)
87
76
88
- /* Compute an inverted index of the set of articles, mapping each language
89
- * to the Wikipedia pages in which it occurs.
77
+ /* Compute an inverted index of the set of articles, mapping each language to the
78
+ * Wikipedia pages in which it occurs.
90
79
*/
91
80
def makeIndex (
92
81
langs : List [String ],
93
82
rdd : RDD [WikipediaArticle ]
94
83
): RDD [(String , Iterable [WikipediaArticle ])] = // TODO
95
84
val fun = (article : WikipediaArticle ) =>
96
- for lang <- langs filter article.mentionsLanguage
97
- yield (lang, article)
85
+ for lang <- langs filter article.mentionsLanguage yield (lang, article)
98
86
99
87
rdd // flatMap, for/yield, filter, groupByKey are LAZY
100
88
.flatMap(fun) // RDD[(String, WikipediaArticle)]
@@ -129,8 +117,7 @@ object WikipediaRanking extends WikipediaRankingInterface:
129
117
rdd : RDD [WikipediaArticle ]
130
118
): List [(String , Int )] = // TODO
131
119
val fun = (article : WikipediaArticle ) =>
132
- for lang <- langs filter article.mentionsLanguage
133
- yield (lang, 1 )
120
+ for lang <- langs filter article.mentionsLanguage yield (lang, 1 )
134
121
135
122
rdd
136
123
.flatMap(fun)
@@ -139,30 +126,24 @@ object WikipediaRanking extends WikipediaRankingInterface:
139
126
.collect // LAZY up to here!
140
127
.toList
141
128
142
/** Program entry point: runs the three ranking strategies over `wikiRdd`, prints the
  * accumulated timing report, and stops the Spark context.
  *
  * The `(args: Array[String])` parameter list is required even though `args` is unused:
  * a parameterless `def main: Unit` is not recognised as a JVM entry point, so the program
  * could not be launched.
  *
  * @param args command-line arguments (ignored)
  */
def main(args: Array[String]): Unit =
  try
    /* Languages ranked according to (1) */
    val langsRanked: List[(String, Int)] =
      timed(" Part 1: naive ranking", rankLangs(langs, wikiRdd))

    /* An inverted index mapping languages to wikipedia pages on which they appear.
     * Declared as a `def`, so the index is only built where it is referenced below. */
    def index: RDD[(String, Iterable[WikipediaArticle])] = makeIndex(langs, wikiRdd)

    /* Languages ranked according to (2), using the inverted index */
    val langsRanked2: List[(String, Int)] =
      timed(" Part 2: ranking using inverted index", rankLangsUsingIndex(index))

    /* Languages ranked according to (3) */
    val langsRanked3: List[(String, Int)] =
      timed(
        " Part 3: ranking using reduceByKey",
        rankLangsReduceByKey(langs, wikiRdd)
      )

    /* Output the speed of each ranking */
    println(timing)
  finally
    // Release Spark resources even when one of the rankings throws.
    sc.stop()
167
148
168
149
// Buffer holding the timing report that `main` prints.
// NOTE(review): presumably filled by `timed`, which is not visible here - confirm.
val timing: StringBuffer = new StringBuffer()
0 commit comments