// File: 04-code-golf/tf04.scala

/**
 * Term-frequency counter: prints the 25 most frequent non-stop words of a text file.
 *
 * Conversion of tf-04.py to Scala. Words are maximal runs of ASCII letters. A word is
 * dropped when its lower-cased form is a stop word, but surviving words are counted
 * case-sensitively ("Foo" and "foo" are different entries).
 *
 * Stop words are read from `../stop_words.txt`, relative to the working directory.
 *
 * Sample run recorded in the original header (the timing, avg 5.2 seconds, predates
 * the current implementation, which reads the input only once):
 * {{{
 * $ time scala tf04 ../pride-and-prejudice.txt
 * (Mr,786)
 * (Elizabeth,635)
 * (very,473)
 * (Darcy,417)
 * (such,378)
 * (Mrs,343)
 * (much,325)
 * (more,325)
 * (Bennet,322)
 * (Bingley,305)
 * (Jane,295)
 * (Miss,281)
 * (one,261)
 * (know,239)
 * (herself,227)
 * (before,225)
 * (sister,218)
 * (soon,214)
 * (never,214)
 * (though,212)
 * (think,210)
 * (time,203)
 * (now,197)
 * (Wickham,194)
 * (well,188)
 *
 * real 0m5.237s
 * }}}
 */
object tf04 {

  /** Comma-separated stop-word list, resolved against the working directory. */
  private val StopWordsPath = "../stop_words.txt"

  /** Reads all lines of `path`, joins them with commas and closes the file. */
  private def slurp(path: String): String = {
    val src = scala.io.Source.fromFile(path)
    try src.getLines().mkString(",") finally src.close()
  }

  /**
   * Builds the stop-word set from comma-separated text.
   *
   * @param raw comma-separated stop words
   * @return the listed words plus every single letter "a" to "z"
   */
  def stopWords(raw: String): Set[String] =
    (raw.split(",") ++ ('a' to 'z').map(_.toString)).toSet

  /**
   * Counts the words of `text` and returns the most frequent ones.
   *
   * @param text  the text to analyse
   * @param stop  lower-case stop words to ignore
   * @param limit maximum number of entries returned
   * @return (word, count) pairs by descending count; ties keep first-occurrence order
   */
  def topWords(text: String, stop: Set[String], limit: Int = 25): List[(String, Int)] = {
    // Tokenize once. `split` yields a leading "" when the text starts with a
    // non-letter, so empty tokens are dropped explicitly.
    val words = text
      .split("[^a-zA-Z]+")
      .filter(w => w.nonEmpty && !stop.contains(w.toLowerCase))
    // Deliberately naive counting (one scan per distinct word), kept from the
    // original conversion; the tf04map variant replaces it with a single pass.
    words.distinct
      .map(u => (u, words.count(_ == u)))
      .sortBy(-_._2)
      .take(limit)
      .toList
  }

  /** Entry point: `args(0)` is the path of the text file to analyse. */
  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      Console.err.println("usage: scala tf04 <text-file>")
      sys.exit(1)
    }
    val stop = stopWords(slurp(StopWordsPath))
    topWords(slurp(args(0)), stop).foreach(println)
  }
}

// File: 09-the-one/tf-04-fold.scala

/**
 * Term-frequency counter, second variant.
 *
 * Attempt to speed up execution time (avg 4.4 seconds as recorded in the original header):
 *  1. Use a compiled regex
 *  2. Accumulate tokens using a catamorphism (a fold over the input lines)
 *
 * Each line's tokens are prepended to the accumulator, so lines end up in reverse
 * order. Counts are unaffected; only the relative order of equally frequent words
 * can differ from the other variants (compare "never"/"soon" below).
 *
 * {{{
 * $ time scala tf04fold ../pride-and-prejudice.txt
 * (Mr,786)
 * (Elizabeth,635)
 * (very,473)
 * (Darcy,417)
 * (such,378)
 * (Mrs,343)
 * (much,325)
 * (more,325)
 * (Bennet,322)
 * (Bingley,305)
 * (Jane,295)
 * (Miss,281)
 * (one,261)
 * (know,239)
 * (herself,227)
 * (before,225)
 * (sister,218)
 * (never,214)
 * (soon,214)
 * (though,212)
 * (think,210)
 * (time,203)
 * (now,197)
 * (Wickham,194)
 * (well,188)
 *
 * real 0m4.392s
 * }}}
 */
object tf04fold {

  /** Comma-separated stop-word list, resolved against the working directory. */
  private val StopWordsPath = "../stop_words.txt"

  /** Matches every run of characters that are not ASCII letters. */
  private val Separator = java.util.regex.Pattern.compile("[^a-zA-Z]+")

  /**
   * Opens `path`, hands its line iterator to `use` and closes the file afterwards.
   * `use` must consume the iterator before returning.
   */
  private def withLines[A](path: String)(use: Iterator[String] => A): A = {
    val src = scala.io.Source.fromFile(path)
    try use(src.getLines()) finally src.close()
  }

  /**
   * Builds the stop-word set from comma-separated text.
   *
   * @param raw comma-separated stop words
   * @return the listed words plus every single letter "a" to "z"
   */
  def stopWords(raw: String): Set[String] =
    (raw.split(",") ++ ('a' to 'z').map(_.toString)).toSet

  /**
   * Folds the lines into one token list.
   *
   * @param lines input lines
   * @param stop  lower-case stop words to ignore
   * @return kept tokens; the last line's tokens come first, and tokens within a
   *         line keep their order
   */
  def tokens(lines: Iterator[String], stop: Set[String]): List[String] =
    lines.foldLeft(List.empty[String]) { (acc, line) =>
      val kept = Separator
        .split(line)
        .filter(t => t.nonEmpty && !stop.contains(t.toLowerCase))
      kept.toList ::: acc
    }

  /**
   * Counts each distinct word by rescanning the token list.
   *
   * @param words all kept tokens
   * @param limit maximum number of entries returned
   * @return (word, count) pairs by descending count; ties keep first-occurrence order
   */
  def topWords(words: List[String], limit: Int = 25): List[(String, Int)] =
    words.distinct
      .map(u => (u, words.count(_ == u)))
      .sortBy(-_._2)
      .take(limit)

  /** Entry point: `args(0)` is the path of the text file to analyse. */
  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      Console.err.println("usage: scala tf04fold <text-file>")
      sys.exit(1)
    }
    val stop = stopWords(withLines(StopWordsPath)(_.mkString(",")))
    val words = withLines(args(0))(tokens(_, stop))
    topWords(words).foreach(println)
  }
}

// File: 09-the-one/tf-04-map.scala

/**
 * Term-frequency counter, third variant.
 *
 * Attempt to speed up execution time (avg 0.9 seconds as recorded in the original header):
 *  1. Use a compiled regex
 *  2. Accumulate tokens using a catamorphism (a fold over the input lines)
 *  3. Count tokens using a 2nd catamorphism (a fold of each line's tokens into a map)
 *
 * {{{
 * $ time scala tf04map ../pride-and-prejudice.txt
 * (Mr,786)
 * (Elizabeth,635)
 * (very,473)
 * (Darcy,417)
 * (such,378)
 * (Mrs,343)
 * (much,325)
 * (more,325)
 * (Bennet,322)
 * (Bingley,305)
 * (Jane,295)
 * (Miss,281)
 * (one,261)
 * (know,239)
 * (herself,227)
 * (before,225)
 * (sister,218)
 * (soon,214)
 * (never,214)
 * (though,212)
 * (think,210)
 * (time,203)
 * (now,197)
 * (Wickham,194)
 * (well,188)
 *
 * real 0m0.882s
 * }}}
 */
object tf04map {

  /** Comma-separated stop-word list, resolved against the working directory. */
  private val StopWordsPath = "../stop_words.txt"

  /** Matches every run of characters that are not ASCII letters. */
  private val Separator = java.util.regex.Pattern.compile("[^a-zA-Z]+")

  /**
   * Opens `path`, hands its line iterator to `use` and closes the file afterwards.
   * `use` must consume the iterator before returning.
   */
  private def withLines[A](path: String)(use: Iterator[String] => A): A = {
    val src = scala.io.Source.fromFile(path)
    try use(src.getLines()) finally src.close()
  }

  /**
   * Builds the stop-word set from comma-separated text.
   *
   * @param raw comma-separated stop words
   * @return the listed words plus every single letter "a" to "z"
   */
  def stopWords(raw: String): Set[String] =
    (raw.split(",") ++ ('a' to 'z').map(_.toString)).toSet

  /**
   * Counts word occurrences in a single pass over the lines.
   *
   * @param lines input lines
   * @param stop  lower-case stop words to ignore
   * @return map from each kept word to its number of occurrences
   */
  def countWords(lines: Iterator[String], stop: Set[String]): Map[String, Int] =
    lines.foldLeft(Map.empty[String, Int]) { (counts, line) =>
      Separator
        .split(line)
        .filter(t => t.nonEmpty && !stop.contains(t.toLowerCase))
        .foldLeft(counts)((m, word) => m.updated(word, m.getOrElse(word, 0) + 1))
    }

  /**
   * Selects the most frequent entries.
   *
   * @param counts word counts
   * @param limit  maximum number of entries returned
   * @return (word, count) pairs by descending count; ties follow the map's iteration order
   */
  def topWords(counts: Map[String, Int], limit: Int = 25): List[(String, Int)] =
    counts.toSeq.sortBy(-_._2).take(limit).toList

  /** Entry point: `args(0)` is the path of the text file to analyse. */
  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      Console.err.println("usage: scala tf04map <text-file>")
      sys.exit(1)
    }
    val stop = stopWords(withLines(StopWordsPath)(_.mkString(",")))
    val counts = withLines(args(0))(countWords(_, stop))
    topWords(counts).foreach(println)
  }
}