diff --git a/04-code-golf/tf-04-a.scala b/04-code-golf/tf-04-a.scala new file mode 100644 index 0000000..2dda0db --- /dev/null +++ b/04-code-golf/tf-04-a.scala @@ -0,0 +1,38 @@ +/** + Faithful conversion of tf-04.py to scala, avg execution time 5.2 seconds + $ time scala tf04a ../pride-and-prejudice.txt +(Mr,786) +(Elizabeth,635) +(very,473) +(Darcy,417) +(such,378) +(Mrs,343) +(much,325) +(more,325) +(Bennet,322) +(Bingley,305) +(Jane,295) +(Miss,281) +(one,261) +(know,239) +(herself,227) +(before,225) +(sister,218) +(soon,214) +(never,214) +(though,212) +(think,210) +(time,203) +(now,197) +(Wickham,194) +(well,188) + +real 0m5.237s + +*/ +object tf04a extends App { + def l(f:String) = io.Source.fromFile(f).getLines.mkString(",") + val s = l("../stop_words.txt").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) + val w = l(args(0)).split("[^a-zA-Z]+").filter(x => !s.contains(x.toLowerCase)) + w.distinct.map(u=> (u,w.count(_==u))).sortBy(-_._2).take(25).foreach(println) +} diff --git a/04-code-golf/tf-04-fold.scala b/04-code-golf/tf-04-fold.scala new file mode 100644 index 0000000..f1d2e69 --- /dev/null +++ b/04-code-golf/tf-04-fold.scala @@ -0,0 +1,42 @@ +/** +Attempt to speed up execution time: Avg 4.3 seconds +1. Use a compiled regex +2. accumulate tokens using a catamorphism + +$ time scala tf04fold ../pride-and-prejudice.txt +(Mr,786) +(Elizabeth,635) +(very,473) +(Darcy,417) +(such,378) +(Mrs,343) +(much,325) +(more,325) +(Bennet,322) +(Bingley,305) +(Jane,295) +(Miss,281) +(one,261) +(know,239) +(herself,227) +(before,225) +(sister,218) +(never,214) +(soon,214) +(though,212) +(think,210) +(time,203) +(now,197) +(Wickham,194) +(well,188) + +real 0m4.392s +*/ +object tf04fold extends App { + def l(f:String) = io.Source.fromFile(f).getLines.mkString(",") + val s = l("../stop_words.txt").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) + val p = java.util.regex.Pattern.compile("[^a-zA-Z]+") + val a:List[Array[String]] = Nil + val w = io.Source.fromFile(args(0)).getLines.foldLeft(a)((b,c)=> p.split(c).filter(x => (x.length > 0) && !s.contains(x.toLowerCase)) :: b).flatten + w.distinct.map(u=> (u,w.count(_==u))).sortBy(-_._2).take(25).foreach(println) +}