Army of Awesome/ElasticSearch Prototype

From MozillaWiki
Jump to: navigation, search

Overview

Metrics has created a five-node ElasticSearch cluster on some test machines. It uses the Twitter River functionality to automatically retrieve a filtered set of documents from Twitter's streaming API and index them.

A very useful tool I found for playing around with this is RESTClient, a Java GUI that allows you to edit requests, view responses, and save both.

Sample 1

Sample of potentially support-eligible tweets from last week, with three facets: number of tweets per day, number of replies per day, and the top 16 contributors for the week.

Notes

  • The histogram facets return keys as epoch timestamps (in milliseconds). I tried to use the script syntax to force the key to be a date string, but it would be better to have the client create a date object from the epoch value. Both the natural and the script versions return nonsensical values for "total" and "mean".
  • The query mixes Lucene-format query_string syntax and ElasticSearch Query DSL syntax purely for example purposes.
  • The non-filter part of the query could be used to provide boost scores (the ^2 and ^0.5 syntax) when looking for relevant tweets. Even then, care must be taken because parsing and searching text is hard. In this sample, there are tweets that use the term "hang" in the context "get the hang of", which is very different from what we'd first assume when searching for the term "hang".
  • For the contributor/reply facets, we use the "global" attribute to specify that these facets shouldn't be restricted by the main query. They provide their own filter queries for #fxhelp.

Query

http://<server>.mozilla.org:9200/twitter_river/status/_search

{
    "from":0,"size":10,
    "query": {
        "filtered": {
            "query": {
                "query_string" : {
                    "query": "text:(crash^4~ beta^2 help hang^0.5~)"
                }
            },
            "filter": {
                "query": {
                    "query_string": {
                        "query": "+created_at: [2010-11-01 TO 2010-11-08] +(text:firefox hashtag:fx) -(text:RT* link:* mention.id:*)"
                    }
                }
            }
        }
    },
    "facets": {
        "fxhelp_elegible_tweets_per_day_last_week": {
            "histogram": {
                "field": "created_at",
                "time_interval": "1d"
            }
        },
        "contributors": {
            "terms": {
                "field":"user.screen_name",
                "size": 16
            },
            "facet_filter": {
                "and": {
                    "filters" : [
                        { "term": { "hashtag":"fxhelp" } },
                        { "range": { "created_at": { "from": "2010-11-01", "to": "2010-11-08" } } }
                    ]
                }
            },
            "global": true
        },
        "fxhelp_replies_per_day_last_week": {
            "histogram": {
                "key_script": "doc['created_at'].date.year * 10000 + doc['created_at'].date.monthOfYear * 100 + doc['created_at'].date.dayOfMonth",
                "value_script": "1"
            },
            "facet_filter": {
                "and": {
                    "filters" : [
                        { "term": { "hashtag":"fxhelp" } },
                        { "range": { "created_at": { "from": "2010-11-01", "to": "2010-11-08" } } }
                    ]
                }
            },
            "global": true
        }
    }
}

Results

{
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 958,
    "max_score" : 0.3623709,
    "hits" : [ {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "909430837223424",
      "_score" : 0.3623709,
      "_source" : {
        "text" : "OMG FIREFOX HANG NO DONT CRASH DAMMIT",
        "created_at" : "2010-11-06T13:56:39.000Z",
        "source" : "<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 186444452,
          "name" : "Weeeee",
          "screen_name" : "shootyou_down",
          "location" : "Singapore",
          "description" : "A SGSones who loves Soshi and the cute-dork-kid-leader-taeyeon :)\r\n\r\nProfile Picture by flying petals :)"
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "1319489731829760",
      "_score" : 0.3226243,
      "_source" : {
        "text" : "e crasha firefox",
        "created_at" : "2010-11-07T17:06:05.000Z",
        "source" : "<a href=\"http://www.twhirl.org\" rel=\"nofollow\">Seesmic twhirl</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 16410554,
          "name" : "Salmen Bejaoui",
          "screen_name" : "MoukaDesign",
          "location" : "Moniga Del Garda ( BS )",
          "description" : "I'm Tunisian Web / Graphic Designer , living in Italy"
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "809541205762048",
      "_score" : 0.30075088,
      "_source" : {
        "text" : "takteng yan... nag crash ang firefox!! back to zero ang streaming~~~",
        "created_at" : "2010-11-06T07:19:44.000Z",
        "source" : "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 60346988,
          "name" : "멜로디 (Melody Joy) ",
          "screen_name" : "melodyjoy42",
          "location" : "Philippines (필리핀)",
          "description" : "a teacher, and A CERTIFIED K-POP LOVER!!  인생은 멜로디 없으면... 재미 없었어... ^____^\r\n"
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "905541014519808",
      "_score" : 0.23943175,
      "_source" : {
        "text" : "Apparently there's a bug that  causes #WebGL to crash the open tab in Mobile Firefox 4 beta 2 on my n900",
        "created_at" : "2010-11-06T13:41:12.000Z",
        "source" : "<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ "WebGL" ],
        "link" : [ ],
        "user" : {
          "id" : 24777027,
          "name" : "Lars Gunther",
          "screen_name" : "itpastorn",
          "location" : "Sweden",
          "description" : "itpastorn = The IT pastor. Preaching the gospel of Christ and of web standards. Member of WaSP Education task Force. madly in love with his wife, @penilla."
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "946131257262080",
      "_score" : 0.23732686,
      "_source" : {
        "text" : "y u so crashy Firefox? :<",
        "created_at" : "2010-11-06T16:22:29.000Z",
        "source" : "<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 19907202,
          "name" : "Katt",
          "screen_name" : "sarcasticKatt",
          "location" : "",
          "description" : "Wait, what?"
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "29585280857",
      "_score" : 0.21502091,
      "_source" : {
        "text" : "today is #firefox crash day.",
        "created_at" : "2010-11-03T16:14:32.000Z",
        "source" : "<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ "firefox" ],
        "link" : [ ],
        "user" : {
          "id" : 5997632,
          "name" : "Bruce Gilbert",
          "screen_name" : "webguync",
          "location" : "Durham, NC",
          "description" : "web and multimedia developer. Always learning new technologies and better ways of doing things. Lover of art, live music and all things fitness related."
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "1214312047710208",
      "_score" : 0.19917017,
      "_source" : {
        "text" : "La nuova versione di #Firefox crasha un pò troppo",
        "created_at" : "2010-11-07T10:08:09.000Z",
        "source" : "web",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ "Firefox" ],
        "place" : {
          "id" : "197ec7280821d2d4",
          "name" : "Vescovato",
          "type" : "city",
          "full_name" : "Vescovato, Cremona",
          "street_address" : null,
          "country" : "Italy",
          "country_code" : "IT",
          "url" : "http://api.twitter.com/1/geo/id/197ec7280821d2d4.json"
        },
        "link" : [ ],
        "user" : {
          "id" : 123884731,
          "name" : "Carla Gobbi",
          "screen_name" : "carlagobbi81",
          "location" : "ITALY Parma & Cremona",
          "description" : "Iphone Dev App: iStress!\r\nLOVE IN F1, MOTOMONDIALE, JUVENTUS & TV! TV! TV! "
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "29392238772",
      "_score" : 0.1922365,
      "_source" : {
        "text" : "Dear Firefox, Do NOT crash again. >.>",
        "created_at" : "2010-11-01T17:36:12.000Z",
        "source" : "web",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 43575045,
          "name" : "Nati Planas",
          "screen_name" : "StarlightNati",
          "location" : "NoLongerNorthBumblefuck,US",
          "description" : "Name's Nati. Spina bifida kid. Paralyzed from the waist down. American by birth, Uruguayan by blood. Music is awesome. Writing is fun. =]"
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "29414215890",
      "_score" : 0.19168442,
      "_source" : {
        "text" : "Firefox goes crash",
        "created_at" : "2010-11-01T22:56:43.000Z",
        "source" : "web",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 18045477,
          "name" : "Bradley",
          "screen_name" : "BradSabbath",
          "location" : "Sydney",
          "description" : "Geek; Socially and Sexually Retarded; Totally Inept with Women; Zombie/Horror film addict; Black Sabbath junkie; "
        }
      }
    }, {
      "_index" : "twitter_river",
      "_type" : "status",
      "_id" : "29571026801",
      "_score" : 0.18289806,
      "_source" : {
        "text" : "FirefoxがCrashした",
        "created_at" : "2010-11-03T13:29:42.000Z",
        "source" : "<a href=\"http://sourceforge.jp/projects/tween/wiki/FrontPage\" rel=\"nofollow\">Tween</a>",
        "truncated" : false,
        "mention" : [ ],
        "hashtag" : [ ],
        "link" : [ ],
        "user" : {
          "id" : 6996352,
          "name" : "Library",
          "screen_name" : "libraryp",
          "location" : "Japan, Kanto",
          "description" : "Mobiler/WORDian/(PSP|Arcade) DIVA Player/jubeat Player/α55ユーザ/車載er/ニコマス好き/趣味は録画とエンコード。関東のどこかを転々としているらしい。フォローは過去の発言を読んでから慎重に行ってくださいね?"
        }
      }
    } ]
  },
  "facets" : {
    "fxhelp_elegible_tweets_per_day_last_week" : {
      "_type" : "histogram",
      "_key_field" : "created_at",
      "_value_field" : "created_at",
      "_comparator" : "key",
      "_interval" : 86400000,
      "entries" : [ {
        "key" : 1288569600000,
        "count" : 140,
        "total" : 1.80407013715E14,
        "mean" : 1.2886215265357144E12
      }, {
        "key" : 1288656000000,
        "count" : 177,
        "total" : 2.28100515428E14,
        "mean" : 1.2887034769943503E12
      }, {
        "key" : 1288742400000,
        "count" : 198,
        "total" : 2.55180310288E14,
        "mean" : 1.28878944589899E12
      }, {
        "key" : 1288828800000,
        "count" : 10,
        "total" : 1.2888322717E13,
        "mean" : 1.2888322717E12
      }, {
        "key" : 1288915200000,
        "count" : 53,
        "total" : 6.8316753411E13,
        "mean" : 1.2889953473773584E12
      }, {
        "key" : 1289001600000,
        "count" : 198,
        "total" : 2.55231969526E14,
        "mean" : 1.289050351141414E12
      }, {
        "key" : 1289088000000,
        "count" : 182,
        "total" : 2.34622296403E14,
        "mean" : 1.2891334967197803E12
      } ]
    },
    "contributors" : {
      "_type" : "terms",
      "_field" : "user.screen_name",
      "terms" : [ {
        "term" : "pravin3832",
        "count" : 237
      }, {
        "term" : "dailycavalier",
        "count" : 18
      }, {
        "term" : "ossreleasefeed",
        "count" : 17
      }, {
        "term" : "rerlin6a",
        "count" : 15
      }, {
        "term" : "michaelverdi",
        "count" : 13
      }, {
        "term" : "mozillaph",
        "count" : 9
      }, {
        "term" : "rymate1234",
        "count" : 7
      }, {
        "term" : "firefoxsux",
        "count" : 5
      }, {
        "term" : "djst",
        "count" : 5
      }, {
        "term" : "ndaru",
        "count" : 2
      }, {
        "term" : "ma",
        "count" : 2
      }, {
        "term" : "kuemerle5",
        "count" : 2
      }, {
        "term" : "seragkhaled",
        "count" : 1
      }, {
        "term" : "sandydolphinmj",
        "count" : 1
      }, {
        "term" : "rifqiyoshioka",
        "count" : 1
      }, {
        "term" : "pimmhogeling",
        "count" : 1
      } ]
    },
    "fxhelp_replies_per_day_last_week" : {
      "_type" : "histogram",
      "_key_field" : "_na",
      "_value_field" : "_na",
      "_comparator" : "key",
      "_interval" : -1,
      "entries" : [ {
        "key" : 20101031,
        "count" : 40,
        "total" : 40.0,
        "mean" : 1.0
      }, {
        "key" : 20101101,
        "count" : 143,
        "total" : 143.0,
        "mean" : 1.0
      }, {
        "key" : 20101102,
        "count" : 80,
        "total" : 80.0,
        "mean" : 1.0
      }, {
        "key" : 20101103,
        "count" : 31,
        "total" : 31.0,
        "mean" : 1.0
      }, {
        "key" : 20101106,
        "count" : 45,
        "total" : 45.0,
        "mean" : 1.0
      } ]
    }
  }
}