From 954d6d24756c5d10f982465aeb7d512acb9cebbd Mon Sep 17 00:00:00 2001 From: krishnanraman Date: Mon, 14 Oct 2013 15:52:34 -0700 Subject: [PATCH 1/4] Scala versions for codegolf --- 04-code-golf/tf-04-a.scala | 38 +++++++++++++++++++++++++++++++ 04-code-golf/tf-04-fold.scala | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 04-code-golf/tf-04-a.scala create mode 100644 04-code-golf/tf-04-fold.scala diff --git a/04-code-golf/tf-04-a.scala b/04-code-golf/tf-04-a.scala new file mode 100644 index 0000000..2dda0db --- /dev/null +++ b/04-code-golf/tf-04-a.scala @@ -0,0 +1,38 @@ +/** + Faithful conversion of tf-04.py to scala, avg execution time 5.2 seconds + $ time scala tf04a ../pride-and-prejudice.txt +(Mr,786) +(Elizabeth,635) +(very,473) +(Darcy,417) +(such,378) +(Mrs,343) +(much,325) +(more,325) +(Bennet,322) +(Bingley,305) +(Jane,295) +(Miss,281) +(one,261) +(know,239) +(herself,227) +(before,225) +(sister,218) +(soon,214) +(never,214) +(though,212) +(think,210) +(time,203) +(now,197) +(Wickham,194) +(well,188) + +real 0m5.237s + +*/ +object tf04a extends App { + def l(f:String) = io.Source.fromFile(f).getLines.mkString(",") + val s = l("../stop_words.txt").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) + val w = l(args(0)).split("[^a-zA-Z]+").filter(x => !s.contains(x.toLowerCase)) + w.distinct.map(u=> (u,w.count(_==u))).sortBy(-_._2).take(25).foreach(println) +} diff --git a/04-code-golf/tf-04-fold.scala b/04-code-golf/tf-04-fold.scala new file mode 100644 index 0000000..f1d2e69 --- /dev/null +++ b/04-code-golf/tf-04-fold.scala @@ -0,0 +1,42 @@ +/** +Attempt to speed up execution time: Avg 4.3 seconds +1. Use a compiled regex +2. 
accumulate tokens using a catamorphism + +$ time scala tf04fold ../pride-and-prejudice.txt +(Mr,786) +(Elizabeth,635) +(very,473) +(Darcy,417) +(such,378) +(Mrs,343) +(much,325) +(more,325) +(Bennet,322) +(Bingley,305) +(Jane,295) +(Miss,281) +(one,261) +(know,239) +(herself,227) +(before,225) +(sister,218) +(never,214) +(soon,214) +(though,212) +(think,210) +(time,203) +(now,197) +(Wickham,194) +(well,188) + +real 0m4.392s +*/ +object tf04fold extends App { + def l(f:String) = io.Source.fromFile(f).getLines.mkString(",") + val s = l("../stop_words.txt").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) + val p = java.util.regex.Pattern.compile("[^a-zA-Z]+") + val a:List[Array[String]] = Nil + val w = io.Source.fromFile(args(0)).getLines.foldLeft(a)((b,c)=> p.split(c).filter(x => (x.length > 0) && !s.contains(x.toLowerCase)) :: b).flatten + w.distinct.map(u=> (u,w.count(_==u))).sortBy(-_._2).take(25).foreach(println) +} From 739371b8817d7e4d8a919f062eee2236ef161b57 Mon Sep 17 00:00:00 2001 From: krishnanraman Date: Mon, 14 Oct 2013 15:53:05 -0700 Subject: [PATCH 2/4] Scala versions for codegolf --- 04-code-golf/tf-04-fold.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/04-code-golf/tf-04-fold.scala b/04-code-golf/tf-04-fold.scala index f1d2e69..cda1bb1 100644 --- a/04-code-golf/tf-04-fold.scala +++ b/04-code-golf/tf-04-fold.scala @@ -1,5 +1,5 @@ /** -Attempt to speed up execution time: Avg 4.3 seconds +Attempt to speed up execution time: Avg 4.4 seconds 1. Use a compiled regex 2. 
accumulate tokens using a catamorphism From b909a80184ba39adb7f13be0e81929f43897bedd Mon Sep 17 00:00:00 2001 From: krishnanraman Date: Mon, 14 Oct 2013 20:50:14 -0700 Subject: [PATCH 3/4] fastest implementation so far at 0.9sec --- 04-code-golf/tf-04-map.scala | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 04-code-golf/tf-04-map.scala diff --git a/04-code-golf/tf-04-map.scala b/04-code-golf/tf-04-map.scala new file mode 100644 index 0000000..8d30a31 --- /dev/null +++ b/04-code-golf/tf-04-map.scala @@ -0,0 +1,53 @@ +/** +Attempt to speed up execution time: Avg 0.9 seconds +1. Use a compiled regex +2. accumulate tokens using a catamorphism +3. count tokens using a 2nd catamorphism + +$ time scala tf04map ../pride-and-prejudice.txt +(Mr,786) +(Elizabeth,635) +(very,473) +(Darcy,417) +(such,378) +(Mrs,343) +(much,325) +(more,325) +(Bennet,322) +(Bingley,305) +(Jane,295) +(Miss,281) +(one,261) +(know,239) +(herself,227) +(before,225) +(sister,218) +(soon,214) +(never,214) +(though,212) +(think,210) +(time,203) +(now,197) +(Wickham,194) +(well,188) + +real 0m0.882s +*/ +object tf04map extends App { + def l(f:String) = io.Source.fromFile(f).getLines + val s = l("../stop_words.txt").mkString(",").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) + val p = java.util.regex.Pattern.compile("[^a-zA-Z]+") + l(args(0)).foldLeft(Map[String,Int]()){ + (b,c) => + p + .split(c) + .filter(x => (x.length > 0) && !s.contains(x.toLowerCase)) + .foldLeft(b){ + (d,e) => + d ++ Map(e -> (d.getOrElse(e,0)+1)) + } + }.toSeq + .sortBy(- _._2) + .take(25) + .foreach(println) +} From 0f3bb8bb5bba61ac890d421b80ea4e93a2e74a07 Mon Sep 17 00:00:00 2001 From: krishnanraman Date: Mon, 21 Oct 2013 11:32:42 -0700 Subject: [PATCH 4/4] moved 2 files to the-one --- 04-code-golf/{tf-04-a.scala => tf04.scala} | 11 ++++++++--- {04-code-golf => 09-the-one}/tf-04-fold.scala | 0 {04-code-golf => 09-the-one}/tf-04-map.scala | 0 3 files 
changed, 8 insertions(+), 3 deletions(-) rename 04-code-golf/{tf-04-a.scala => tf04.scala} (70%) rename {04-code-golf => 09-the-one}/tf-04-fold.scala (100%) rename {04-code-golf => 09-the-one}/tf-04-map.scala (100%) diff --git a/04-code-golf/tf-04-a.scala b/04-code-golf/tf04.scala similarity index 70% rename from 04-code-golf/tf-04-a.scala rename to 04-code-golf/tf04.scala index 2dda0db..abe0aba 100644 --- a/04-code-golf/tf-04-a.scala +++ b/04-code-golf/tf04.scala @@ -30,9 +30,14 @@ real 0m5.237s */ -object tf04a extends App { +object tf04 extends App { def l(f:String) = io.Source.fromFile(f).getLines.mkString(",") val s = l("../stop_words.txt").split(",") ++ (1 to 26).map(i=>String.valueOf(Character.toChars(96+i))) - val w = l(args(0)).split("[^a-zA-Z]+").filter(x => !s.contains(x.toLowerCase)) - w.distinct.map(u=> (u,w.count(_==u))).sortBy(-_._2).take(25).foreach(println) + + l(args(0)).split("[^a-zA-Z]+").filter(x => !s.contains(x.toLowerCase)) + .distinct + .map(u=> (u,l(args(0)).split("[^a-zA-Z]+").filter(x => !s.contains(x.toLowerCase)).count(_==u))) + .sortBy(-_._2) + .take(25) + .foreach(println) } diff --git a/04-code-golf/tf-04-fold.scala b/09-the-one/tf-04-fold.scala similarity index 100% rename from 04-code-golf/tf-04-fold.scala rename to 09-the-one/tf-04-fold.scala diff --git a/04-code-golf/tf-04-map.scala b/09-the-one/tf-04-map.scala similarity index 100% rename from 04-code-golf/tf-04-map.scala rename to 09-the-one/tf-04-map.scala