SlideShare a Scribd company logo
EKAW 2010 • Tutorial T3
  Friday • 15th october 2010


Knowledge Acquisition from Social Networking Sites
Z. Zhang, A.E. Cano, K. Elbedweihy, A.-S. Dadzie
Introduction                                                                          A little housekeeping ...
This exercise is designed to help you ...                                             Folder structure – top level - ekaw-kasna_exercises/
                                                                                      • data
• understand the procedure of knowledge acquisition                                      – data/animalcorpus/
   from social networking sites                                                          – data/examples/
                                                                                         – data/corpora/facebook_data | twitter_data/
• learn to use relevant tools to acquire information and                              • code
   knowledge from social networking sites                                                – facebook/
                                                                                         – twitter/
• create a simple application to demonstrate the                                         – information_extraction/ekawtutorial/ | jatr_v1.0/
   technologies in practice                                                           • external libraries
                                                                                         – lib/


                                                                                      • downloads from tutorial website
                                                                                         http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/
                                                                                         tutorial_prep.html#exercise_downloads
                                                                                         http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/
                                                                                         tutorial_prep.html#third_party_downloads




A little housekeeping ...                                                             A little housekeeping ...
Running the applications                                                              Using ant
• Test internet connection
    – to run facebook and twitter examples

• tested with JDK 1.6
• ant build script
    – build.xml + ekaw.kasna.TestRunner class
    – double-click on starter file for each application and O/S OR
         • may need to modify rights to execute (chmod 755)
    – enter ‘ant’ at console for top level of each source code folder OR
• standalone
    – set up classpath (O/S dependent)
    – call javac with each test class
• IDE
    – create a new application using src folders for each of twitter, facebook & ie
    – set up classpath (IDE dependent)
    – set up application properties and run each main method
A little housekeeping ...                                              Setup
Standalone                                                             facebook and twitter APIs
                                                                       • Documentation:
                                                                          – facebook Graph API:
                                                                                http://developers.facebook.com/docs
                                                                          – twitter API
                                                                                http://apiwiki.twitter.com/Twitter-API-Documentation
                                                                       • Sign up:
                                                                          – facebook: http://www.facebook.com
                                                                          – twitter: https://twitter.com/signup
                                                                       • Libraries
                                                                          – RestFB: http://restfb.com
                                                                          – twitter4j: http://twitter4j.org/en




Setup                                                                  Scenario
 Natural Language Processing and Information Extraction                2010 South Africa World Cup – match summarisation
                                                                       • During the 2010 World Cup tournament in South Africa,
• OpenNLP 1.4  Java toolkit for building NLP and IE applications       twitter and facebook were used extensively as a discussion
                                                                       board for fans to exchange information and opinions about
    – contains pre-built language models to be used by OpenNLP for
      language processing
                                                                       matches;
   http://opennlp.sourceforge.net                                         – hundreds of thousands of messages were generated daily
   http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/                 on the two social networking sites;
     exercise_rscs/ie_models_eng.zip                                      – a large proportion of these messages discuss the match of
                                                                             the day;
• Java Automatic Term Recognition toolkit (JATR)                       • we are interested in analysing these messages
   http://www.dcs.shef.ac.uk/~ziqizhang/resources/tools/
     jatr_v1.0.zip
                                                                          – to understand what are the most popular topics that
                                                                             interest people;
Scenario cont.                                                   Scenario Analysis
2010 South Africa World Cup – match summarisation                Re-cap from the morning session
• To do so we built a “match summarisation” application          • how to identify specific content of interest
   – input - corpus of messages related to a match                  – content retrieval and filtering
   – output - ranked list of representative terms that can be    • how to process the content and make sense of it
   used to summarise corpus content                                 – information extraction
                                                                    – natural language processing
• Using the extracted terms we can analyse what has been
the focus of discussion of the match of the day
• For this very exercise, we study the match between
                                      A Knowledge
England and Germany on the 27th of June 2010.
                                   Acquisition process




Scenario Analysis                                                Corpus Generation
• Input: corpus of messages related to a match                   • Goal: create a corpus of messages
   – we need to pin-point relevant messages on twitter and          – that discuss the match between England and Germany
      facebook                                                         on 27th June 2010
   – using twitter and facebook APIs, we apply content           • Input:
      retrieval and filtering to build this corpus
                                                    Corpus          – twitter API providing access to twitter data
                                                  generation        – facebook API providing access to facebook data
                                                                    – content filtering parameters (the England-Germany
• Output: ranked list of representative terms                          match on 27th June 2010)
   – we apply IE and NLP on the corpus to achieve this goal      • Output
                                           Content                  – corpus of messages related to only the match of interest
                                         analysis by IE
Corpus Generation using twitter




                            twitter




                                                                       Code in: ekaw-kasna_exercises/twitter
                                                                       External libs: lib/twitter4j-core-2.1.6-SNAPSHOT.jar |
                                                                                         log4j-1.2.15.jar




Corpus Generation using twitter                                       Corpus Generation using twitter
 Ex.1 REST API (Analysing the public timeline status)                  Ex.1 REST API
 • Provides methods for fetching data related to:                      • Analyze the structure and content of public
    • Timelines, Status, Users, Members, subscribers, followers,          timeline statuses
       social graphs etc.                                                 – Where was the status tweeted from?
 – Wait! You will need to complete the code for it to                     – Was it a retweet?
    actually do something! - Edit the class:
                                                         Exercise
    ekaw.kasna.twitter.StatusTest
 • Refer to the Twitter4J javadoc to complete the exercises:
    http://twitter4j.org/en/javadoc/index.html

 Try it yourself: run StatusTest.java
a%$/',*T+"+$-)%"*',6"4*#26:+$*                                                                                                   a%$/',*T+"+$-)%"*',6"4*#26:+$*
  A'3>%8AC!%IJ@%                                                                                                                   A'3>%8AC!%IJ@%
                                                                                                                                   •! 9",2+$
                          Twitter twitter = new                                                                                     try{
                          TwitterFactory().getInstance();                                                                             ResponseList<Status>publicTimeline = twitter.getPublicTimeline();
                                                                                                                                      //*TODO Complete exercise and analyse structure and content of each status
    try{                                                                                                                              GeoLocation geoLocation;
         //We request the public timeline, which returns a list of Status                                                             Place place;
      ResponseList<Status> publicTimeline = twitter.getPublicTimeline();                                                              while (it.hasNext()){
          /**                                                                                                                               Status st = it.next();
           * Complete this exercise and analyse the structure and content                                                                   log.info(st.getText());
  of each of the Status.                                                                                                                    log.info(st.getSource());
           * Have a look at the java doc of the Status Class, or just                                                                       if ((geoLocation = st.getGeoLocation()) != null)
  check the available methods in your IDE                                                                                                      log.info(geoLocation.toString());
           */                                                                                                                               if ((place = st.getPlace()) != null) {
       Iterator<Status> it = publicTimeline.iterator();                                                                                       log.info(place.getFullName());
                                                                                                                                              log.info(place.getBoundingBoxCoordinates().toString());
       while (it.hasNext()){
                                                                                                                                            }
           //TODO check what are the info you can get from a Status.
                                                                                                                                      }
       }
                                                                                                                                    } catch (TwitterException e){

  •! !(1%#-%1.2($&/DS*+&6#*-"&*$'"*C-9-2$!&$-3Z969%                                                                                 }
                                                                                                                                      e.printStackTrace();




a%$/',*T+"+$-)%"*',6"4*#26:+$*                                                                                                   a%$/',*T+"+$-)%"*',6"4*#26:+$*
  A'3>%8AC!%IJ@%
                                                                                                                                 A'3N%C&9()"%IJ@%
  •! E'#/'#**)7+36"+*,#-#',
 ??????????!!??888888888 RT @nico_news: ???????????????????????????????????????? http://bit.ly/aZcvfl
 <a href="http://twipple.jp/" rel="nofollow">?????/twipple</a>
                                                                                                                                 •! 933%2,*6"#+$-()%"*26#.*#26:+$*
 Southampton v Tranmere: Preview followed by live coverage of Saturday's game between Southampton and Tranmere in L...
       http://bit.ly/9N802N
                                                                                                                                    $&9()"*-"&*-(&,*$*&-#-*
 <a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a>
 Laper gueeee                                                                                                                       –! #%/*#%/6(,*#.-#*-$+*('$$+"#3G*#$+"&6"4*%"*
 <a href="http://www.snaptu.com" rel="nofollow">Snaptu.com</a>
 ?????????????????????????? / ??????????????????????????
                                                                                                                                       A26:+$*
                                                                                                                                 •! !#*+>/%,+,*#.+*0%33%26"4*7+#.%&,S**
 <a href="http://www.echofon.com/" rel="nofollow">Echofon</a>
 Changing the Language of Oppression http://bit.ly/aXA4w3 #specialneeds
 <a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>
 Are you attending the SuperSwarm at Jewel, Piccadilly tonight? Let's get an idea of numbers via my poll @ www.theprgeek.co.uk      –! ,+-$(.O**
       #superswarmLDN
 web                                                                                                                                –! #$+"&,O**
 Simon Cowell To Receive Special Emmy Award: October 7, 2010: Music mogul and former American Idol judge Simo... http://
       tinyurl.com/299o5gg                                                                                                          –! #$+"&,?('$$+"#O*#$+"&,?&-63GO*#$+"&,?
 <a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a>
 "Wajahmu seperti bulan" --» ini artinya ngatain kan yah? Org bulan bolong2                                                            2++13G*
 <a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry®</a>
 FM????????????
 <a href="http://stone.com/Twittelator" rel="nofollow">Twittelator</a>
                                                                                                                                 •! A.+*F+-$(.*9U!*,'//%$#,*-7%"4*
 ???? [????:?????/????????????????????????]559 #colopl_msg
 <a href="http://t.colopl.jp/t/" rel="nofollow">Colotwi</a>
                                                                                                                                    %#.+$,O*#.+*0%33%26"4*%/+$-#%$,*0%$*
 pikiran saya cabangnya banyak, jd pusing sendiri..penuh rasanya ni kepala                                                          (%",#$'()"4*-*5'+$G*,#$6"4*
 <a href="http://m.tweete.net" rel="nofollow">m.tweete.net</a>...
Corpus Generation using twitter                                                      Corpus Generation using twitter
 Ex.3 Search API                                                                       Ex.2 Search API
                                                                                       – Wait! You will need to complete the code for it to actually do
 sinceid:     Specifies the id of the status from which to start the search               something! - Edit the class:
 untilid:     Specifies the id of the status from which to end the search
                                                                                             ekaw.kasna.twitter.QueryTest                                                                           Exercise
 Since:       Statuses produced since a specified date (e.g. 2010-03-10)
 Until:                                                                                       Query query = new Query();
 filter:links Retrieves tweets without links                                                 query.query("football");

 from:        Retrieves statuses from a given user. (e.g. from: fifa)                               //*TODO Modify the query object, and search for
 lang:        Retrieves statuses in a given language                                         today's tweets (in english) related to football

 OR           e.g., mentioning Mexico OR France                                                     //*TODO Restrict your results to tweets generated
                                                                                              within 300 kilometers of Johannesburg, South Africa
 : )          e.g., containing football with a positive attitude (e.g. football :) )
                                                                                                    // hint: use Query's geoCode method, the
 Negation     e.g., mentioning beer but not root                                              Kilometers unit is given as Query.KILOMETERS
                                                                                                    // hint: South Africa's lat: 26.12, long: 28.2
 Source:      e.g., Containing football entered via TwitterFeed (e.g. news
              source:TwitterFeed)                                                      • Try it yourself: run QueryTest.java




Corpus Generation using twitter                                                      Corpus Generation using twitter
                                                                                       Ex.1 REST API
 Ex.2 Search API
                                                                                       • Output  query request for ‘football’ near ‘Johannesburg’
 – Answer
                                                                                     hits:15
        Query query = new Query();                                                   MQMhlanzi:Total Football 360: Bafana Eager to Keep the Momentum of Winning! http://t.co/xOPTaY9
                                                                                     Benleeds:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or
        query.query("football");                                                           Morecambe or Dagenham and Redbridge?
                                                                                     Tumelo13:Gota admit I miss my NONstop #football convo's wit @Denisao_4 and @GordonTyler8! Haha talk bout nothing but the
                                                                                           #beautifulgame
              //*TODO Modify the query object, and search for                        Tumelo13:RT @Denisao_4: Ey bra @Tumelo13 that's not a sin! That's for the love of football! I approve wow! Let's hope it works :)??
        today's tweets related to football                                                 Amen
                                                                                     Edwardo84:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer
                                                                                     jonerz97:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Morecambe
              //*TODO Restrict your results to tweets generated                            or Dagenham and Redbridge?
        within 300 kilometers of Johannesburg, South Africa                          dcocker11:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer
                                                                                     AntimoOsato91:@siasduplessis Oros and The Dutch National Football Team could be good sponsors too! Haha :)
                                                                                     IsaacTeka:#football - EURO 2012 qualifier between Germany and Turkey is gonna be a fierce encounter. #Ozil and #Khedira
              // hint: use Query's geoCode method, the                               applenessuk:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer
                                                                                     johnyrotten:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or
        Kilometers unit is given as Query.KILOMETERS                                       Morecambe or Dagenham and Redbridge?
              // hint: Johannesburg’s lat: 26.12, long: 28.2                         kartikverma:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer
         query.geoCode(new GeoLocation(26.12,28.2),                                  RawRemedy:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or
                                                                                           Morecambe or Dagenham and Redbridge?
        30,Query.KILOMETERS);                                                        TLW1Dan:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer
                                                                                     jopayne:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Morecambe
                                                                                           or Dagenham and Redbridge?
Corpus Generation using twitter                                                  Corpus Generation using twitter
 Ex.3 Stream API                                                                  Ex.3 Stream API
                                                                                  Twitter 4j allows you to retrieve streaming samples using the class
RestAPI and SearchAPI only present a limited snapshot of
                                                                                  TwitterStream. For the public timeline you just need basic
a timeline.              During the finals of the 2010 World Cup                  authentication.
                                 the rate of tweets containing the tags
                                 #Spain, #Netherlands, #Germany,                  [*** Create a TwitterStream instance
                                 #Uruguay, was quite high.                                twitterStream = new
                                                                                         TwitterStreamFactory(this).getInstance("yourAcc","yourPass");

                                 Two options:                                            Set a Listener for receiving the event of a status. Your listener should
                                                                                  ^*
                                 •! make requests, say, every 2sec                       implement the method public void onStatus(Status status)
                                  through the RestAPI or the Search API,
                                 •! BETTER:                                              twitterStream.setStatusListener(this);
                                      •! start listening to a stream of public
                                                                                  l***   Start Sampling
                                      tweets &
                                                                                         twitterStream.sample();
                                      •! filter according to the tag patterns
                                                                                  Y*     Do something with the tweet in your onStatus method




Corpus Generation using twitter                                                  Corpus Generation using twitter
 Ex.3 Stream API                                                                  Ex.3 Stream API
 – Wait! You will need to complete the code for it to actually do                – Answer
    something! - Edit the class:
    ekaw.kasna.twitter.StreamTest                                                        private void startConsuming() throws TwitterException {
                                                                                            twitterStream.setStatusListener(this);

    private void startConsuming() throws TwitterException {                                 //*TODO Using TwitterStream’s filter method,
       twitterStream.setStatusListener(this);                                            restrict your sampling to collect tweets that include
                                                                                         the words: football, worldcup, final
       //*TODO Using TwitterStream’s filter method,
    restrict your sampling to collect tweets that include                                     String[] filterWords = {"#worldcup", "#WorldCup",
    the words: football, worldcup, final                                                 "#Worldcup", "#WORLDCUP"};
                                                                                                 twitterStream.setStatusListener(this);
        twitterStream.sample();                                                                  twitterStream.filter(0,null,filterWords);
    }                                                                                       twitterStream.sample();
                                                                                         }

 • Try it yourself: run StreamTest.java
Corpus Generation using twitter                                      Corpus Generation using twitter
Additional Exercise: Authentication                                  • Try it yourself!
                                                                        • Authenticating using Oauth
• restrictions to accessing private data!!!                                  • OAuthTest.java
                                                                        • Using the application “Ekaw-Kasna”
• CHANGES SEP 2010                                                      • Login with your twitter account and go to:
     • change to authentication mode for retrieving individuals’        http://twitter.com/apps/new
     status information
     • from a simple username-password to:
          • Oauth-based authentication of registered “applications”




Corpus Generation using twitter                                      Corpus Generation using twitter
                                                                       • Authenticating using Oauth
                                                                          – Running the example requires a PIN
                                                                             • enter the URL at the console in a web browser
                                                                             • to obtain an oauth_token
                                        You will need these two
                                       strings for authenticating




                                                                                                                You will be giving
                                                                                                               authorization to this
                                                                                                               application to access
                                                                                                                 your information
Corpus Generation using twitter                                Corpus Generation using twitter
 • Authenticating using Oauth                                   • Authenticating using Oauth
    – Running the example requires a PIN                           – Running the example requires a PIN
       • enter the URL to obtain an oauth_token                       • enter the URL to obtain an oauth_token
                                                                   – Once you “Allow” authorization you will be provided
    – Once you “Allow” authorization you will be provided             with the PIN:
       with a PIN:                                                 – Enter the PIN to complete authentication
                                            This is the PIN        “YOU ARE AUTHENTICATED!!”
                                              needed to
                                             complete the
                                            authentication




                                                               Corpus Generation using facebook




                          facebook




                                                                Code in: ekaw-kasna_exercises/facebook
                                                                External libs: lib/restfb-1.5.3.jar | log4j-1.2.15.jar
0-(+@%%1*9U!**W+#(.6"4*E@N+(#,*                                 0-(+@%%1*9U!**W+#(.6"4*e,+$*&-#-*
 •! The Graph API                                                 https://graph.facebook.com/facebook
     •! provides facilities for reading and writing data to
     facebook

 •! Each API request starts with the URL:
    https://graph.facebook.com

 •! e.g., data about any object can be found by fetching
    https://graph.facebook.com/objectID
   - objectID is the unique id of this object in the social
 graph

   - e.g., the unique id for a page is its name:
    https://graph.facebook.com/facebook




facebook API: Connections                                       facebook API: Connections

 •! All objects in the facebook social graph are connected via
 relationships (connections)

 •! Fetch connections
    https://graph.facebook.com/objectID/connection_type


 •! e.g., the page’s own posts
    https://graph.facebook.com/facebook/posts
facebook API: Page Connections                                                              facebook API: Filtering Data

feed             The page’s wall                                                              •! Data can be filtered using parameters
picture          The page’s profile picture                                                       •! e.g.,
tagged           The photos, videos, and posts in which this page has been tagged                   -! since, until ---> specify date ranges
links            The page’s posted links
                                                                                                    -! limit ---> specify amount of returned data
photos           The photos this page has uploaded
groups           The groups this page is a member of
albums/videos    The photo albums/videos this page has created
                                                                                                 •! e.g., fetching the feed
statuses         The page’s status updates
                                                                                                      -! within specified dates and
notes            The page’s notes                                                                     -! with a limit of 50
posts            The page’s own posts
                                                                                                 https://graph.facebook.com/worldcup/feed?
                                                                                              since=2010-07-17&until=2010-07-20&limit=50
members          The page’s members. You can only query up to 500 members. It is not
                 possible to iterate through the list. Example: https://graph.facebook.com/
                 {PAGE_ID}/members?limit=500
events           The events this page is attending
checkins         Checkins made by friends of the current session user




facebook API: Filtering Data                                                                facebook API: Finding Objects

                                                                                              •! Search for objects
                                                                                                 https://graph.facebook.com/search?
                                                                                              q=query&type=objectType


                                                     “created_time” is within                    - query ---> what you want to find
                                                     the specified date ranges
                                                                                                 - objectType ---> type of the object (e.g.
                                                                                              facebook post, user)

                                                                                              •! e.g., search all public posts for “2010 world cup”
                                                                                                   https://graph.facebook.com/search?q=2010%20world
                                                                                              %20cup&type=post
facebook API: Finding Objects                           facebook API: Graph API Exercise

                                                          Try it yourself!

                                                          •! Fetch the data about the page worldcup

                                                          •! Get the feed of this page (hint: connection is feed)
                                                              •! this is the wall for the page worldcup

                                                          •! Return only the first 5 messages of this feed
  Posts containing the terms
   “2010” + “world” + “cup”
                                                          •! Search for all pages containing worldcup in the
                                                          page name




facebook API: Graph API Exercise                        facebook API: Graph API Exercise
 •! ANSWERS                                               •! ANSWERS
     •! page worldcup:                                        •! Get the feed (wall) of the page worldcup:
                                                             https://graph.facebook.com/worldcup/feed
          •! fetch https://graph.facebook.com/worldcup
facebook API: Graph API Exercise                                    facebook API: Graph API Exercise
 •! ANSWERS                                                           •! ANSWERS
     •! Return only the first 5 messages of the feed:                     •! Search for all pages containing worldcup in the
    https://graph.facebook.com/worldcup/feed?limit=5
                                                                             page name
                                                                             https://graph.facebook.com/search?q=worldcup&type=page




Client Libraries                                                     RestFB API: World Cup Scenario

 •! Multiple client libraries for facebook API                        •! Exercise:
    http://developers.facebook.com/search?                                get the messages sent on the day of the
 q=User:Client_Libraries
                                                                          England-Germany match - 27th of June 2010
    •! RestFB client library was the first java library to support
                                                                      1     Search for all pages containing “worldcup”
    the GraphAPI
    •! Other Java libraries now supporting GraphAPI
                                                                      2      For every page:
        - BatchFB
                                                                              •! Get the messages posted on that day
        - TinyFBGraphClient
                                                                              •! Store the messages to generate your corpus
        - facebook Java Webapp

    •!We use the RestFB client library in this tutorial
RestFB API: DefaultFacebookClient                                                  RestFB API: Searching

 •! DefaultFacebookClient                                                            •! Step 1:
     •! provides methods for reading and writing data
                                                                                             Connection<T>
     to facebook graph                                                                       fetchConnection(String connection,
                                                                                             Class<T> connectionType,
 FacebookClient facebookClient                                                               Parameter... parameters)
 = new DefaultFacebookClient();                        Access public data
                                                                                    FacebookClient facebookClient = new DefaultFacebookClient();

 facebookClient = new                                                               Connection<Page> pageSearch =
 DefaultFacebookClient(ACCESS_TOKEN);                                               facebookClient.fetchConnection("search",Page.class,
                                                                                       Parameter.with("q", "world cup"), Parameter.with("type",
                                                                                       "page"), Parameter.with("limit", "10"));
                                Required to access private
                                 data or edit/publish data
                                                                                                                      https://graph.facebook.com/search?q=world
                                                                                                                              +cup&type=page&limit=10




RestFB API: Searching                                                              RestFB API: return from request - pages
•! returns a list of the first 10 pages about “worldcup”                            •! World Cup Pages
•! For each page, properties returned include:
                                                                                     K9M&%                Q9-&+.(1%                 @<%
    –! 6&O*"-7+O*(-#+4%$GO*0++&O*/6(#'$+,*b                                          `%$3&*a'/*           U%36)(6-",*               J_tY[_YulvI*
                                                                                     `%$3&*a'/*           U$%&'(#,=%#.+$*           [lJJJvYvuItt^lu*
                         4+#Q-#-*<<s*$+#'$",*-*36,#*%0*%@N+(#,*H&+/+"&6"4*%"*#.+*    2%$3&*('/*           F/%$#,=-#.3+)(,*          [lY[Ivl_l^vv_vl*
                                         (%""+()%"*$+5'+,#+&K*
                                                                                     `%$3&*a'/*^_[_*      U$%&'(#,=%#.+$*           ^JIvvtYItvvv*
                                                                                     C'4@G*`%$3&*a'/*     F/%$#,=-#.3+)(,*          [[v^Iv^l^[Il*
 for (Page page : pageSearch.getData()) {                                            ^_[_*`%$3&*a'/*      e"1"%2"*                  [^J_YtltY^_^^tJ*
     System.out.print("Name: " + page.getName());                                    w`ECVQ*aeUd*         a3'@,*                    [^lvttYI^[Iv*
     System.out.print("Category: " + page.getCategory());                            `%$3&*a'/*%"*RFUZ*   F/%$#,=-#.3+)(,*          [v[Jl[lt^_Y_*
     System.out.println("ID: " + page.getId());
                                                                                     `ECVQ*aeU*           F/%$#,=#+-7,*             [^_l_IlvYvv[_Jv*
 }*
                                                                                     ^_[_*`%$3&*a'/*      V%(-3=@',6"+,,*           lvI[[uIIlt[v*
RestFB API: Exercise                                         RestFB API: Exercise

 Try it yourself!                                              ANSWERS
                                                               Connection<Group> groupSearch =
 •! Edit the class SearchTest.java                                facebookClient.fetchConnection(
                                                                        "search", Group.class,
                                                                        Parameter.with("q", "2010 world cup"),
 •! Search for all groups talking about a topic of                      Parameter.with("type", "group"),
                                                                        Parameter.with("limit", "15"));
 interest to you
 •! Get the first 15 groups                                    for (Group group : groupSearch.getData()) {
                                                                  System.out.println("Name: " + group.getName());
 •! For every group:                                              System.out.println("ID: " + group.getId());
                                                               }
      - print name and ID




RestFB API: return from request - groups                     RestFB API: Getting the feed
‘2010 world cup’ groups
  K9M&%                                    @<%
                                                               •! Step 2:
  kkkkkkk**x-7-3+1*Ey(6-3*T$%'/*           ^^JJ^[YItu[J*
  ^_[_*W!W9*`ECVQ*aeU*                     [^Y[Iulu_uJ[YJv*
                                                                     Connection<T>
                                                                     fetchConnection(String connection,
  ^_[_*W!W9*`%$3&*a'/*                     ^^_YtlvIYJ*
                                                                     Class<T> connectionType,
  ^_[_*W!W9*`ECVQ*aeU*FEeAn*9WC!a9*        ^I_Ilt[tYJI*
                                                                     Parameter... parameters)
  ^_[_*W60-*`%$3&*a'/*F%'#.*90$61-*        [^_uIl^[[^II[Ju*
  ^_[_*W!W9*`%$3&*a'/*F%'#.*90$6(-*        [[[I_tJvJJ[YYlv*
  ^_[_*W60-*`%$3&*a'/*Q$6"16"4*T-7+*       ^lv[^t[ut_^u*       Connection<Post> myFeed = facebookClient.fetchConnection(
  ^_[_*W!W9*`ECVQ*aeU*FEeAn*9WC!a9*        [_tJ^t^u^J[Jlt_*       "worldcup/feed", Post.class, Parameter.with("since",
  g'"&6-3*^_[_*F'&-0$6(-*^_[_*`%$3&*('/*   [uuv^tvtIlvl*          "2010-06-27T11:00:00"), Parameter.with("until",
                                                                  "2010-06-28T17:00:00"), Parameter.with("limit", "10"));
  !#-36-*<*^_[_*W!W9*`%$3&*a'/*            [tJYlYIlt^^*
  ^_[_<W!W9<`%$3&<a'/*                     [^vlIIll_I[^uIl*
  ^_[_*`%$3&*a'/**                         [[^_uJ^JttlJYYu*
  ^_[_*`%$3&*a'/*                          [ulll^l[vlIl*                                 .:/,S??4$-/.;0-(+@%%1;(%7?2%$3&('/?0++&k
  ^_[_*W!W9*`%$3&*a'/*                     [l_YvttuvuvJYII*                             ,6"(+r^_[_<_v<^IP'")3r^_[_<_v<^tP3676#r^_*
  ^_[_*W!W9*`%$3&*a'/*                     [vl[Y_tt[uIt*
REST API: Getting the feed                                                                                REST API: return from request - feed
                                                                                                           Try it yourself! - ConnectionsTest.java
•! feed returns all posts written on the specified date
                                                                                                             •!    Message: the english were hoping to play penalties what a waste of their
•! For each post attributes returned include:                                                                      training time
    –! creation time, post name, description….                                                                                 Creation Time: Sun Jun 27 17:45:13 BST 2010
                                                                                                             •!    Message: Deutschland, Deutschland über alles, über alles in der Welt
                                                                                                                             Creation Time: Sun Jun 27 17:29:25 BST 2010
                                                                                                             •!    Message: world cup?? this wasn't a 'football games' but 'fakeball' games!!
 for (Post post : myFeed.getData()) {                                                                              Lampard was scored but the referee was blind....4-1?? congrats to the
                                                                                                                   referees coz they have a massive party tonite to celebrate!! $$$$$$$$$$$$$
    System.out.println("Message: " + post.getMessage());                                                           $$$ wow.... even can makes people blind!!! world cup??? **** off!!!
    System.out.println("\tCreation Time: " +                                                                                Creation Time: Sun Jun 27 17:25:32 BST 2010
        post.getCreatedTime());
                                                                                                             •!    Message: how are we suppose to be patriotic with a team that plays like
    }*                                                                                                             that, none of them deserve the money they get, waste of time..............
                                                                                                                             Creation Time: Sun Jun 27 16:48:06 BST 2010
                                                                                                             •!    Message: john terry on england should get worst defender for the year...he's
                                                                                                                   no good
                                                                                                                             Creation Time: Sun Jun 27 16:42:39 BST 2010




REST API: Post Properties, Connections                                                                    Corpus Generation using facebook
 Properties                                                                                                Additional Exercise: Authentication
id                   The post ID                                                                           •! restrictions to accessing private data!!!
from                 An object containing the ID and name of the user who posted the message
                                                                                                                  •! Access Token required for some methods
to                   A list of the profiles mentioned or targeted in this post
message              The message                                                                                        •! to prevent access (read or write) to private data
picture              If available, a link to the picture included with this post                                       •! e.g., publishing to the facebook social graph
link                 The link attached to this post                                                               •! Biddington provides a good explanation for getting access tokens at:
name                 The name of the link                                                                  http://benbiddington.wordpress.com/2010/04/23/facebook-graph-
caption/description  The caption/description of the link (appears beneath the link name)                   api-getting-access-tokens
source               If available, the source link attached to this post (for e.g., a flash or video file)
icon                 A link to an icon representing the type of this post
attribution          A string indicating which application was used to create this post
                                                                                                           •! e.g., fetch the friends of user ‘khadija.elbedweihy’
actions              A list of available action names and links (including commenting, liking and an             •! this requires authentication token ‘1158590550...’
                     optional app-specified action)                                                                   https://graph.facebook.com/khadija.elbedweihy/
likes                The number of likes on this post
                                                                                                                      friends?access_token=11585905509...
created_time         The time the post was initially published
updated_time         The time of the last comment on this post                 All properties &
                                                                             connections of a              •! Try it yourself....
 Connections
                                                                                 “Post”
comments             All of the comments on this post
facebook API: Fetching User data                      facebook API: Fetching User data
 https://graph.facebook.com/khadija.elbedweihy
                                                         •! fetch specific fields
                                                         https://graph.facebook.com/khadija.elbedweihy?
                                                         fields=id,name,picture




                              Public Data only

                                                           Link to the
                                                             picture

                                                                                      Picture at the
                                                                                        given link




facebook API: Authorization Example                    facebook API: Authorization Example




                                  Access token works
                                  for the authorized
                                       user only


                                                                                          Same access token for a
                                                                                          different user “does not
                                                                                                  work”
facebook API: User Fields                                                            facebook API: User Connections
 id:                    The user’s ID
                                                                                       home:                 The user’s News Feed. Requires the read_stream permission
 first_name:            The user’s first name
                                                                                       feed:                 The user’s wall. Requires the read_stream permission to see
 last_name:             The user’s last name                                                                non-public posts.
 name:                  The user’s full name                                           tagged:               The photos, videos, and posts in which this user has been
 about                  The user’s blurb that appears under their profile picture                            tagged. Requires the read_stream permission.
 birthday               The user’s birthday                                            posts:                The user’s own posts. Requires the read_stream permission
 work/education         A list of the work/education history from the user’s profile                         to see non-public posts.

 email:                 The proxied or contact email address granted by the user       picture:              The user’s profile picture

 website                A link to the user’s personal website                          friends:              The user’s friends

 hometown               The user’s hometown                                            activities/interests/ The activities/interests/music/books/movies/television listed on
                                                                                       music/books/          the user’s profile
 location               The user’s current location                                    movies/television:
 gender                 The user’s gender                                              likes:                All the pages this user has ‘liked’. Requires the user_likes or
 interested_in          Genders the user is interested in                                                    friend_likes permission.
 meeting_for            Types of relationships the user is seeking                     photos:               The photos this user is tagged in. Requires the
                                                                                                             user_photo_video_tags, friend_photo_video_tags and
 relationship_status    The user’s relationship status
                                                                                                             user_photos or friend_photos permissions.
 religion               The user’s religion




                                                                                      Scenario Analysis
                                                                                      Input: a corpus of messages related to a match
                                                                                           –! we need to pin-point relevant messages on twitter and
                                                                                           facebook
                                                                                           –! using twitter and facebook API, we apply content
                                                                                           retrieval and filtering to build this corpus
                       information extraction                                                                                                               Corpus
                                                                                                                                                          generation

                                                                                      Output: a ranked list of representative terms
                                                                                           –! we apply IE and NLP on the corpus to achieve this goal

                                                                                                                                              Content
                                                                                                                                            analysis by IE
Content Analysis via IE                                     Content Analysis via IE

 •! To analyse the content and extract important terms,      •! To analyse the content and extract important terms,
    we follow these steps                                       we follow these steps:
     –! Natural language analyses of each message               –! Natural language analyses of each message
       •! Tokenisation                                             (tokenisation, POS tagging)        OpenNLP
       •! POS tagging                                           –! Identify candidate information units of interest
    –! Identify candidate information units of interest            (phrase chunking, entity recognition)
       •! phrase chunking
    –! Identify statistically important information             –! Identify statistically important information (term
       •! term recognition
                                                                   recognition)
                                                                                           JATR




Content Analysis: Natural Language Analysis                Content Analysis: Natural Language Analysis
•! Goal: process natural language text such that specific   •! An example: (located in “data/examples/
   information can be identified                               example1.txt”)
   –! These processes include
      •! Sentence segmentation                                “Rooney fails to end goal drought. Wayne Rooney’s
      •! Tokenisation                                               trip to South Africa 2010 began with high
      •! Part of Speech tagging                                 expectations but he leaves without a single goal
•! Input                                                       scored after three group matches and a 1-4 defeat
    –! a single message                                                          to Germany.”
•! Output
    –! a sequence of POS tagged tokens
Content Analysis: Natural Language Analysis                  Content Analysis: Natural Language Analysis
 •! Sentence segmentation                                       •! Sentence segmentation using OpenNLP
    –! Input: a single message
                                                              /* Input */ (LINE 17)
    –! Output: a list of sentences                            String pathToInput = "../../data/examples/example1.txt";
                                                              String content = "…";

 Rooney fails to end goal drought. | Wayne Rooney's trip to   /* Creates an object of OpenNLP sentence segmentation detector */
 South Africa 2010 began with high expectations but he        SentenceDetector detector = new SentenceDetector("lib/opennlp/models/
                                                                 EnglishSD.bin.gz");
 leaves without a single goal scored after three group
 matches and a 1-4 defeat to Germany.                         /* Call the actual method to identify the end offsets of sentences. */
                                                              int[] result = detector.sentPosDetect(content);

                                                              /* Print out the sentences */ Rooney fails to end goal drought. Wayne Rooney's
 Try it yourself! - SentenceSegmentation.java                 int start=0, i=0;              trip to South Africa 2010 began with high
                                                                                             expectations but he leaves without a single goal
                                                              do {                           scored after three group matches and a 1-4 defeat
                                                              ……                             to Germany.
                                                              } while(start<result[result.length-1]);




Content Analysis: Natural Language Analysis                  Content Analysis: Natural Language Analysis
 •! Tokenisation                                                •! Tokenisation using OpenNLP
    –! Input: a single sentence, or message                     /* Input text message */ (LINE 28)
                                                                String content = "…" // read in the text content from "example1.txt"
    –! Output: a list of tokens
                                                                List<String> sentences = new ArrayList<String>();
                                                                ……
                                                                /* Code for splitting sentences */

  Rooney fails to end goal drought                              /*Creates an object of OpenNLP tokeniser using a pre-built English language
                                                                   model. */
                                                                //change the path accordingly
                                                                String pathToEngTokenisationModel = "lib/opennlp/models/EnglishTok.bin.gz";
  Rooney, fails, to, end, goal, drought, .                      Tokenizer tokeniser = new Tokenizer(pathToEngTokenisationModel);

                                                                /*Tokenise each sentence and print out the result*/

 Try it yourself! - Tokenisation.java                           for(String sentence: sentences){
                                                                    String[] result=tokeniser.tokenize(sentence);
                                                                    for(String tok:result)
                                                                     System.out.println(tok); Rooney fails to end        goal drought.
                                                                 }
Content Analysis – Natural Language Analysis                     Content Analysis – Natural Language Analysis
 • Part of speech tagging                                          • POS tagging using OpenNLP
                                                                 /*Input text message*/ (LINE 31)
    – Input: a list of tokens                                    String content = "…" //read in the text content from example1.txt

    – Output: a list of tokens with their part of speech tag     List<String> tokens = new ArrayList<String>();
                                                                 /* Code for tokenisation and add the result into the list object above.
                                                                 You do not need to do sentence segmentation in this case. Because the
                                                                    tokenisation will detect sentence boundary as a first step*/

  Rooney, fails, to, end, goal, drought, .
                                                                 /*Creates an object of OpenNLP POS tagger using a pre-built English
                                                                    language model.*/
                                                                  //change the path accordingly
  Rooney/NNP fails/VBZ to/TO end/VB goal/NN drought/             String pathToEngPOSModel = "lib/opennlp/models/tag.bin.gz";
                                                                  /* You MAY specify additionally two parameters for the constructor, i.e.,
  NN ./.                                                            TagDicionary and Dictionary.*/
                                                                 PosTagger tagger = new PosTagger(pathToEngPOSModel, (Dictionary)null);

 Try it yourself! - POSTagger.java                              /*Tag the list of tokens and print out the result*/
                                                                 String[] result=tagger.tag(tokens.toArray(new String[0])); goal/NN
                                                                               Rooney/NNP fails/VBZ to/TO end/VB
                                                                               drought/NN ./.
                                                                 for (String tag: result)
                                                                    System.out.println(tag);




Content Analysis – Phrase Chunking                              Content Analysis – Natural Language Analysis
 • Goal: identifying information units that make good              • Phrase chunking
    candidate terms of our interest                                  – Input: a list of POS-tagged tokens
 • In this exercise, we focus on noun phrases                        – Output: a list of phrases (noun/verb phrases)
     – which often bear important domain-specific
        information                                                Rooney/NNP fails/VBZ to/TO end/VB goal/NN drought/
                                                                     NN ./.
 • Input
     – POS-tagged tokens
 • Output                                                          Rooney, goal drought
     – Noun phrases
                                                                                                                   Exercise
                                                                   Try it yourself!
                                                                     o edit the class PhraseChunker.java and run
a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,*                                         a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,*
                                                                                      •! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ*
  •! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ*                                                   (LINE 44 in PhraseChunker.java)
                                                                                      int[] result = detector.sentPosDetect(content);
                                                                                      int start = 0, i = 0;
  (LINE 32 in PhraseChunker.java)
                                                                                      do {
  //initilising all required NLP processors, If you get an out of memory
                                                                                            //sentence splitting
  //exception, try increasing your JVM heap space to at least 256MB
                                                                                            String sentence = content.substring(start, result[i]);
  String pathToEngTokenisationModel = "lib/opennlp/models/EnglishTok.bin.gz";
                                                                                             //TODO: tokenization, put tokens in a String array. Hint:
  String pathToEngPOSModel = "lib/opennlp/models/tag.bin.gz";
                                                                                             //Tokenisation.java
  String pathToEngPhraseModel = "lib/opennlp/models/EnglishChunk.bin.gz";
                                                                                              String[] tokens = null;
                                                                                              //TODO: POS tagging, put tags in a String array. Hint: POSTagger.java
  SentenceDetector detector = new SentenceDetector("lib/opennlp/models/
     EnglishSD.bin.gz");                                                                      String[] tags = null;
  Tokenizertokeniser = new Tokenizer(pathToEngTokenisationModel);                             //This is the method you use to chunk phrases on a list of tokens and
  PosTagger tagger = new PosTagger(pathToEngPOSModel, (Dictionary) null);                     //a list of tags
                                                                                              String[] phrases = chunker.chunk(tokens, tags);
  TreebankChunkerchunker = new TreebankChunker(pathToEngPhraseModel);                         //See the result
                                                                                              for(String p:phrases)
                                                                                             System.out.println(p);
                                                                                              ……
                                                                                            start = result[i];
                                                                                            i++;
                                                                                      } while (start < result[result.length - 1]);




a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,*                                         a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,*
•! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ*                                                     •! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ*
(LINE 44 in PhraseChunker.java)                                                        (LINE 78 in PhraseChunker.java)
int[] result = detector.sentPosDetect(content);                                              String npstart = "B-NP";             a%&+*0$%7*36"+*It*%"2-$&,*/$%(+,,+,*
int start = 0, i = 0;                                                                        String vpstart = "B-VP";
                                                                                                                                  #.6,*$+,'3#*-"&*4+"+$-#+,*#.+*$+-3*
do {                                     A.+*$+,'3#*6,*"%#*+>-(#3G*#.+*/.$-,+,*2+*           String npcontinue = "I-NP";
                                                                                                                                  /.$-,+,*
      //sentence splitting               +>/+(#+&O*@'#*-*36,#*%0*c#-4,dO*2.6(.*-$+*          String vpcontinue = "I-VP";
                                         (%77%"3G*',+&*6"*ZVU*/.$-,+*
      String sentence = content.substring(start, result[i]);                                 String other = "O";
     //TODO: tokenization, put tokens in (.'"16"4S* array.
                                          a String                                                String phrase = "";
     String[] tokens=null;                                                                       for (int n = 0; n < tokens.length; n++) {
                                         X<ZU*****C%%"+G     *     *C%%"+G*
     //TODO: POStagging, put tags in a String array. Hint: POSTagger.java                             if (phrases[n].equals(npstart) || phrases[n].equals(vpstart)) {
     String[]–tags = null;
           B “begin”
                                         X<}U******0-63,*                                                 phrase = tokens[n];
     //ThisI is“inside”
             – the method you use to chunk phrases on a list of tokens and                                for (int m = n + 1; m < tokens.length; m++) {
                                         !<}U*******#%    *  *     *0-63,*#%*+"&*
      //a list – “Noun phrase”
           NP of tags                                                                                         if (phrases[m].equals(npcontinue) ||
                                         !<}U*******+"&*
     String[] phrases phrase”
           VP – “Verb = chunker.chunk(tokens, tags);
                                         X<ZU*****4%-3*                                       phrases[m].equals(vpcontinue)) {
     //See the result
                                         !<ZU******&$%'4.# *
     for (int k = 0; k < phrases.length; k++) {
                                                                   *4%-3*&$%'4.#*                                phrase = phrase+" "+tokens[m];
                                                                                                            } else {
         System.out.println(phrases[k] + "tt" + tokens[k]);
                                                                                                                 System.out.println("Actual phrase: "+phrase);
     }
                                                                                                                 phrase = "";
     ……
                                                                                                                 break;
      start = result[i];
                                                                                       ...
      i++;
                                                                                             }
} while (start < result[result.length - 1]);
a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,*                                  g%$+*+>+$(6,+,*60*G%'*-$+*6"#+$+,#+&*
•! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ%
   –! A.+*-",2+$b;*                                                             •! C+/+-#*/$+86%',*#-,1,*',6"4*#.+*(%$/',*4+"+$-#+&*
 (LINE 44 in PhraseChunker.java)                                                   ',6"4*#.+*#26:+$*-"&*0-(+@%%1*9U!,*
 int[] result = detector.sentPosDetect(content);
 int start = 0, i = 0;                                                          •! A$GS*
 do {
         //sentence splitting                                                      –! F+"#+"(+*,+47+"#-)%"*
         String sentence = content.substring(start, result[i]);
        //TODO: tokenization, put tokens in a String array.
                                                 B-NP         Rooney
                                                                                   –! A%1+"6,-)%"*
         String[] tokens=tokeniser.tokenize(sentence);
                                                 B-VP
         //TODO: pos tagging, put tags in a String array.
                                                              fails                –! U-$#<%0<,/++(.*#-446"4*
                                                 I-VP         to
         String[] tags = tagger.tag(tokens);
                                                 I-VP on a list of tokens
         //This is the method you use to chunk phrases
                                                              end                  –! U.$-,+*(.'"16"4*
         //and a list of tags                    B-NP         goal
         String[] phrases = chunker.chunk(tokens,I-NP
                                                   tags);     drought
         //See the result                        O        .
         for(String p:phrases)                   Actual phrase: Rooney
       System.out.println(p);                    Actual phrase: fails to end
          ……                                     Actual phrase: goal drought
         start = result[i];
         i++;
 } while (start < result[result.length - 1]);




Next                                                                           Domain Term Recognition

 • To analyse the content and extract important terms,                         • Goal: extract statistically significant terms, which
    we follow these steps                                                         collectively determine the summary of the match
     – Natural language analyses of each message                               • Recap: domain term recognition procedure
        (tokenisation, POS tagging)                                               – NLP processes to identify candidate lexicons, e.g.,
     – Identify candidate information units of interest                              noun-phrases, entities
        (phrase chunking, entity recognition)                                     – Statistical measures to evaluate the significance of
                                                                                     candidate lexicons
     – Identify statistically important information (term
                                                                                     • term frequency; tf-idf; weirdness, glossex, c-value,
        recognition)                                                                    termex
Domain Term Recognition                                                        JATR – Java Automatic Term Recognition toolkit
 • Goal: extract statistically significant terms, which                        • JATR
    collectively determine the summary of the match                               – Java-based toolkit for developing and testing domain
 • Recap: domain term recognition procedure                                          term recognition algorithms
     – NLP processes to identify candidate lexicons, e.g.,                     • Use JATR to
        noun-phrases, entities                                                    – extract domain terms from a collection of
     – Statistical measures to evaluate the significance of                          documents
        candidate lexicons                                                             • 5 state-of-the-art algorithms implemented
         • term frequency; tf-idf; weirdness, glossex, c-value,                   – implement additional algorithms
            termex
                                                                                  – evaluate different algorithms under the same
                                                                                     framework




JATR – Java Automatic Term Recognition toolkit                                 Domain Term Recognition using JATR
 • JATR is a Java-based toolkit for developing and                             • JATR
    testing domain term recognition algorithms                                    – basic frequency measure
                                    In the following exercise, you will use
 • Use JATR to                    these algorithms and compare the results        – 5 additional state-of-the-art algorithms implemented
     – extract domain terms from a collection of                                     • Term frequency inverse document frequency (tf-idf)
        documents                                                                    • C-Value                      Background is covered in
          •! J*,#-#+<%0<#.+<-$#*-34%$6#.7,*67/3+7+"#+&*                               •! `+6$&"+,,*                    #.+%$G*,36&+,*H€[vl<*[IvK*

     –! 67/3+7+"#*-&&6)%"-3*-34%$6#.7,*                                               •! T3%,,-$G*+>#$-()%"*HT3%,,+>K*
     –! +8-3'-#+**&6{+$+"#*-34%$6#.7,*'"&+$*#.+*,-7+*                                 •! A+$7*+>#$-(#%$*HA+$7+>K*
        0$-7+2%$1*                                                              •! (-"*@+*',+&*-,*-*(%77-"&<36"+*@-,+&*-//36(-)%"*
                                          9&8-"(+&*#%/6(**2633*@+*
                                              (%8+$+&*@$6+zG*
Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*                                         Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*
 •! ^.G%-.%2$&%;I!8%*                                                       •! ^.G%-.%2$&%;I!8%*
     –! V%(-#+*G%'$*]9AC*0%3&+$*                                               –! ,#-$#*#.+*-//36(-)%"O*+;4;O*#.+*0$+5'+"(G*7+-,'$+*
                                                                                 uk.ac.shef.wit.jatr.debug.TestFrequency
     –! (%"D4'$+*G%'$*-//36(-)%"*6"*Z9-(30(.0&(:&$*6"*
        pG%'$=N-#$q?#+,#*                                                      –! 26#.*76"67'7*7+7%$G*•7>J[^7*
        •! N-#$;,G,#+7;"3/rpG%'$=N-#$q?"3/=$+,%'$(+,*                          –! (%/G*N-#$*-"&*3%4Y*/$%/+$)+,*D3+,*#%*G%'$*(3-,,+,*0%3&+$*
          *9((+,,*#%*ZVU*#%%3,*$+5'6$+&*@G*]9AC*                                  -|+$*(3+-"*-"&*$+(%7/63+*
        •! N-#$;,G,#+7;#+$7;7->2%$&,rJ*                                        –! ,++*D3+*L5'6(1,#-$#;#>#M*6"*]9AC*0%3&+$*0%$*-&&6)%"-3*
          *g->67'7*"'7@+$*%0*2%$&,*6"*-*#+$7*                                     6"0%$7-)%"*
        •! N-#$;,G,#+7;#+$7;64"%$+=&646#,r#$'+*
          *a-"*-*#+$7*(%"#-6"*&646#,k    %%
                                                                               –! #+,#*26#.*#.+*#26:+$*-"&*0-(+@%%1*(%$/%$-*
     –! (%/G*/$%/+$)+,*D3+,*#%*pG%'$=N-#$q?(3-,,+,*




Domain Term Recognition using JATR                                         Domain Term Recognition using JATR
Running the tests with ant                                                 Running the tests with ant
• ant script set up to run from folder <your_jatr>/test
• default arguments
   –! /-#.=#%=(%$/',*r*0%3&+$*6"*pG%'$=N-#$q?#+,#?w)"Gw*
   –! /-#.=#%=$+0+$+"(+=(%$/',=,#-#,*r*pG%'$=N-#$q?w"3/=$+,%'$(+,?
      @"(='"60$5,;"%$7-3w*
• to use alternative args enter one or both of
   –! ant
        -Dpath_to_corpus=alt_corpus_path
        -Dpath_to_reference_corpus_stats=alt_reference_corpus_stats_path



• output to test folder
   –! /-:+$"S*!"#$%&'()*+),*B*9AC=9VTEC!Ang;#>#*
   – or run ALL tests by calling ‘AlgorithmTester’
Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*                                              Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*
 •! B,*&($-9,*#,+%-"&%.2-02-*                                                    •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi*
      –! A.+*/$%(+,,*%0*#.+*-//36(-)%"*6,*3%44+&*6"*cN-#$;3%4d*                      –! 3%%1*-#*$'"HK*7+#.%&*6"*
      –! A.+*$+,'3#,*-$+*%'#/'#*#%*-*D3+*(-33+&**                                       uk.ac.shef.wit.jatr.debug.TestFrequency.java

        *p-34%$6#.7="-7+q=9AC=934%$6#.7;#>#O*+;4;O*
         cF67/3+=#+$7=0$+5'+"(G=9AC=9VTEC!Ang;#>#d*                              Part 1: Extracting candidate terms by NLP9*c,#%/*2%$&d*36,#*6,*
                                                                                                                                     ',+&*#%*$+7%8+*"%6,+*
      –! 2.6(.*(%"#-6",*$-"1+&*36,#*%0*#+$7,*+>#$-(#+&*0$%7*#.+*                 //stop word list
                                                                                                                                     2%$&,O*+;4;O*L#.+MO*L-"&M*
                                                                                 StopList stop = new StopList(true);
         (%$/',O*%"+*#+$7*/+$*36"+S*
                                                                                 //lemmatiser
      2%$3&('/*‚`ECVQaeU*‚`%$3&a'/*‚2%$3&('/*‚`%$3&('/        ***[u^Y;_*
                                                                                 Lemmatiser lemmatizer = new Lemmatiser(); V+77-),-)%"*6,*',+&*#%*
                                                                                 //noun phrase extractor                           "%$7-36,+*#+$7,*#%*#.+6$*
                                                                                                                                 (-"%"6(-3*0%$7,*H,++*#.+%$G*
    A.+*D$,#*#+$7*                                                               CandidateTermExtractornpextractor = new
                                                            A.+*"'7@+$*6,*                                                              ,36&+,*[ll<*[lvK*
        6,*#.+*              A.+*%#.+$*#+$7,*-$+*                                   NounPhraseExtractorOpenNLP(stop, lemmatizer);
                                                            #.+*(-3('3-#+&*
   (-"%"6(-3*0%$7*           #.+*8-$6-"#,*0%'"&*6"*         ,(%$+*0%$*#.-#*      ……
     %0*-33*%0*6#,*               #.+*(%$/',*                    #+$7**                                          ]9AC*',+,*-*&+0-'3#*%/+"<"3/*
      8-$6-"#,**                                                                                                @-,+&*"%'"*/.$-,+*(.'"1+$*#%*
                                                                                                                   +>#$-(#*(-"&6&-#+*#+$7,*




Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*                                              Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*
 •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi*                               •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi*
      –! 3%%1*-#*$'"HK*7+#.%&*6"*                                                                                             U$%(+,,%$,*
                                                                                 Part 1: Extracting candidate terms by NLP cont.
         uk.ac.shef.wit.jatr.debug.TestFrequency.java                            TermFreqCounter npcounter = new TermFreqCounter();$+5'6$+&*0%$*
                                                                                                                                             (%'")"4*#+$7*
                                                                                 WordCounter wordcounter = new WordCounter();
                                                                                                                                              0$+5'+"(6+,*
            Rooney, fails, to, end, goal, drought, .                             //create global resource index builder, which indexes
                                         9*c,#%/*2%$&d*36,#*
 Part 1: Extracting candidate terms by NLP                                          global resources,
 //stop word list                                   6,*',+&*#%*$+7%8+*
                                                        "%6,+*2%$&,*             //such as documents and terms and their relations
 StopList stop = new StopList(true);                                             GlobalResourceIndexBuilder builder = new
 //lemmatiser                                                                       GlobalResourceIndexBuilder();
 Lemmatiserlemmatizer = new Lemmatiser();          V+77-),-)%"*6,*',+&*#%*       //build the global resource index
 //noun phrase extractor                           "%$7-36,+*#+$7,*#%*#.+6$*
                                                                                 GlobalResourceIndex termDocIndex = builder.build(new      W%$*6"&+>6"4*#+$7,*
                                                 (-"%"6(-3*0%$7,*H,++*#.+%$G*
 CandidateTermExtractornpextractor = new                                            CorpusImpl(args[0]), npextractor);                      -"&*&%('7+"#,*
                                                         ,36&+,*[ll<*[lvK*
    NounPhraseExtractorOpenNLP(stop, lemmatizer);                                ….
 ……                                                                                                     !"8%16"4*ZVU*/$%(+,,+,*#%*$+-&*6"*
                                 ]9AC*',+,*-*&+0-'3#*%/+"<"3/*                                           &%('7+"#,O*,+47+"#*,+"#+"(+,O*
                                @-,+&*"%'"*/.$-,+*(.'"1+$*#%*                                            -//3G*#%1+"6,-)%"O*UEF*#-446"4O*
                                   +>#$-(#*(-"&6&-#+*#+$7,*                                                   -"&*/.$-,+*(.'"16"4*
Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*                                               Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC*
 •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi*                                •! ^.G%-.%2$&%;I!8%*,'77-$G*
 Part 2: Apply statistical analyse on extracted terms.                               –! A%*$'"*%#.+$*-34%$6#.7,O*$+/3-(+*#.+*-34%$6#.7*#+,#+$*(3-,,*
 FeatureCorpusTermFrequency termCorpusFreq =                                            6"*#.+*(%77-"&S**
                                                             a$+-#+*0+-#'$+,*
     new FeatureBuilderCorpusTermFrequency(npcounter,
                                                             $+5'6$+&*@G*#.6,*         *N-8-*p7+7%$G=(%"D4q*<(3-,,/-#.*p-33=N-$=D3+,q*
    wordcounter, lemmatizer).build(termDocIndex);
                                                                /-$)('3-$*              '1;-(;,.+0;26#;N-#$;&+@'4;p&+,6$+&=-34%$6#.7=#+,#+$q*
                                                                -34%$6#.7*
 AlgorithmTester tester = new AlgorithmTester();                                       *p/-#.=#%=%'$=2%$3&=('/=(%$/',q*
 tester.registerAlgorithm(new FrequencyAlgorithm(), new                              –! F%'$(+*(%&+*-"&*N-8-&%(*-$+*-8-63-@3+*
    FrequencyFeatureWrapper(termCorpusFreq));
                                                                                     –! F%7+*-34%$6#.7,*7-G*-,1*0%$*-"*-&&6)%"-3*/-$-7+#+$*6"*
 tester.execute(termDocIndex);
                                                                                        #.+*(%77-"&S*p/-#.=#%=$+0+$+"(+=(%$/',=,#-#,q*
 System.out.println("Ended at: " + new Date()); a$+-#+*-"*6",#-"(+*%0*#.+*
                                                -34%$6#.7*0%$*#+,)"4*-"&*               •! #.+,+*-34%$6#.7,*',+*$+0+$+")-3*(%$/',*,#-),)(,*#%*(%7/'#+*
                                                   36"1*#%*6#,*$+5'6$+&*                   #.+*c#+$7"+,,d*
              !"8%1+*,#-),)(-3*-"-3G,6,O*6;+;O*          0+-#'$+,*
                                                                                        •! ',+*#.+*D3+*c@"(='"60$5,;"%$7-3d*H,#-#,*%0*#.+*X$6),.*
               (%7/'#+*#.+*,(%$+*',6"4*#.+*
                   -34%$6#.76(*0%$7'3-*                                                    Z-)%"-3*a%$/',K*'"&+$*cpG%'$=N-#$q?"3/=$+,%'$(+,q*.+$+*




Domain Term Recognition using JATR                                               Domain Term Recognition using JATR

• More exercises                                                                  • Advanced topic – development using JATR
    – Try all the algorithms on the different corpora                                – To develop new algorithms using JATR, you must
       provided for you                                                                 • Implement your own algorithm, implementing
        • using the World Cup corpus: corpus (twitter & facebook)                          the interface
        • try the Wikipedia corpus for articles about animals data                       uk.ac.shef.wit.jatr.core.algorithm.Algorithm
           (animalcorpus)                                                              • Implement your own algorithm feature
             – compare the effect of different corpora on accuracy                        wrapper
                                                                                            – to fetch features required by your algorithm
                                                                                            – your class must extend
                                                                                              uk.ac.shef.wit.jatr.core.algorithm.Abstrac
                                                                                              tFeatureWrapper
                                                                                        • list of examples can be found in the package
                                                                                          uk.ac.shef.wit.jatr.core.algorithm
Domain Term Recognition using JATR                                    The End - Summary
• Advanced topic – development using JATR
                                                                       • In this exercise we have
    – Implement new features - for each create                           – Learnt to use the facebook and twitter APIs
      • a new class extending                                               • to collect interesting data for specific application
        uk.ac.shef.wit.jatr.core.feature.AbstractFeature
                                                                                 purpose
      • another class extending
        uk.ac.shef.wit.jatr.core.feature.AbstractFeature                  – Learnt to use OpenNLP
        Builder                                                               • to perform basic NLP tasks
   – a list of examples can be found in the package                       – Learnt to use JATR
     uk.ac.shef.wit.jatr.core.feature
                                                                              • to perform domain term recognition from a corpus
   – Try other NLP tools
                                                                          – Tested OpenNLP and JATR on
   – Create your own methods for extracting candidate terms (e.g.,
                                                                              • a facebook corpus
      n-gram instead of noun phrases)
                                                                              • a twitter corpus
      • see uk.ac.shef.wit.jatr.core.npextractor




The End – Final Words                                                The End – A Big Thank You!

 • Knowledge acquisition from social
    networking sites is challenging
     – Exercises show that informal language and short,
        terse messages cause inaccuracies in results
     – This is however not a real-world application
                                                                             Thank you very much for
         • larger data sets, more varied data necessary to                   attending this tutorial!
            appreciate full scale of challenges
            – How to filter real useful terms from the result
              according to user interest?
            – How to link the terms to their context so they make
              sense?
            – and many more questions to consider…

More Related Content

PDF
Innotech - Get Me a Mobile Strategy or You’re Fired!
PDF
Where 2.0 -- Get me a mobile strategy or you’re fired!
PDF
Get Me a Mobile Strategy or You're Fired - Central Oregon Ad Fed
PDF
Get me a mobile strategy or you're fired web 2
PDF
Mobile: The Market, The Web and Windows Phone’s Future
PDF
Google Talk: DOs and DON'Ts of Mobile Strategy
PDF
Get Me a Mobile Strategy or You're FIRED!
PDF
User Engagement - A scientific Challenge
Innotech - Get Me a Mobile Strategy or You’re Fired!
Where 2.0 -- Get me a mobile strategy or you’re fired!
Get Me a Mobile Strategy or You're Fired - Central Oregon Ad Fed
Get me a mobile strategy or you're fired web 2
Mobile: The Market, The Web and Windows Phone’s Future
Google Talk: DOs and DON'Ts of Mobile Strategy
Get Me a Mobile Strategy or You're FIRED!
User Engagement - A scientific Challenge

What's hot (19)

PDF
Worm Composting Instructions
PDF
Innovations democra tic-document-veille-slideshare
PDF
Suitcase magazine
PDF
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
PDF
Ipad gump
PDF
Vietnamese favorite celebrities
PDF
Site 6 orientacao 2
PDF
Mobile is the future: Do you have your strategy?
PDF
Buku panduan pengelolaan-e-journal
PDF
QUIETING THE ECHOES - a case study for creatives
PDF
Brinch hansen
PDF
CompTIA IT - Skills Gaps Study International
PDF
Urban Agriculture Australia & Canberra City Farm
PDF
Travel & Lifestyle
PDF
Guia Cursos Formación General
PDF
Public international-law-notesp
PDF
PDF
YouTube popularity in Vietnam
PDF
الفروقات الفردية بين الطلاب كيف نفهمها
Worm Composting Instructions
Innovations democra tic-document-veille-slideshare
Suitcase magazine
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
Ipad gump
Vietnamese favorite celebrities
Site 6 orientacao 2
Mobile is the future: Do you have your strategy?
Buku panduan pengelolaan-e-journal
QUIETING THE ECHOES - a case study for creatives
Brinch hansen
CompTIA IT - Skills Gaps Study International
Urban Agriculture Australia & Canberra City Farm
Travel & Lifestyle
Guia Cursos Formación General
Public international-law-notesp
YouTube popularity in Vietnam
الفروقات الفردية بين الطلاب كيف نفهمها

Viewers also liked (16)

PPS
Locklear
PDF
Stretching the Life of Twitter Classifiers with Time-Stamped Semantic Graphs
PPT
Pedir Servir Traer
PPTX
PPTX
Violence det ijcnlp13-slideshare
PDF
Harnessing Linked Knowledge Sources for Topic Classification in Social Media
KEY
Product CEO vs The World
PDF
Detecting child grooming behaviour patterns on social media
PPTX
Representing, Proving and Sharing Trustworthiness of Web Resources Using Vera...
PDF
A Study of the Impact of Persuasive Argumentation in Political Debates
PDF
Volatile Classification of Point of Interests based on Social Activity Streams
PDF
Sensing 
Presence
(PreSense)
Ontology
–
 
User 
Modelling
 in 
the 
Semantic ...
PPT
Units Of Measurement Spanish
PPT
Introduction to Biometric lectures... Prepared by Dr.Abbas
PPT
Reflexive Verb Intro
PPT
El Modo Imperativo Updated
Locklear
Stretching the Life of Twitter Classifiers with Time-Stamped Semantic Graphs
Pedir Servir Traer
Violence det ijcnlp13-slideshare
Harnessing Linked Knowledge Sources for Topic Classification in Social Media
Product CEO vs The World
Detecting child grooming behaviour patterns on social media
Representing, Proving and Sharing Trustworthiness of Web Resources Using Vera...
A Study of the Impact of Persuasive Argumentation in Political Debates
Volatile Classification of Point of Interests based on Social Activity Streams
Sensing 
Presence
(PreSense)
Ontology
–
 
User 
Modelling
 in 
the 
Semantic ...
Units Of Measurement Spanish
Introduction to Biometric lectures... Prepared by Dr.Abbas
Reflexive Verb Intro
El Modo Imperativo Updated

Similar to Ekaw2010 tutorial3 practical (20)

PDF
Data Citation from the perspective of tracking data reuse
PDF
Crown Partners Social Media in the Enterprise
PDF
Social media for communicators
PDF
The Why & How Of Social Media In English
PDF
Social Signals & Search
PDF
The Science of Search, Google & Social Signals
PDF
Portfolio
PDF
The Pixel Lab 2011-Ben Grass: Financing & Partnerships
PDF
DIREITOS FUNDAMENTAIS NA PRESTAÇÃO SEGURANÇA PÚBLICA
PDF
Direitos Fundamentais na Prestação da Segurança Pública
PDF
Optimisation tnc1
PDF
#truShanghai and #truBeijing April 15-17th
PDF
Filtrowanie treści - dylematy operatorów serwisów społecznościowych
PDF
Wierzbowski
PDF
Slimmer werken aan zorgdossiers in organisaties
PDF
PDF
Business research
PDF
WALA Tutorial at PLDI 2010
PDF
What Is Social Media
 
PDF
The Case for B2B Social Media: Womma Webinar
Data Citation from the perspective of tracking data reuse
Crown Partners Social Media in the Enterprise
Social media for communicators
The Why & How Of Social Media In English
Social Signals & Search
The Science of Search, Google & Social Signals
Portfolio
The Pixel Lab 2011-Ben Grass: Financing & Partnerships
DIREITOS FUNDAMENTAIS NA PRESTAÇÃO SEGURANÇA PÚBLICA
Direitos Fundamentais na Prestação da Segurança Pública
Optimisation tnc1
#truShanghai and #truBeijing April 15-17th
Filtrowanie treści - dylematy operatorów serwisów społecznościowych
Wierzbowski
Slimmer werken aan zorgdossiers in organisaties
Business research
WALA Tutorial at PLDI 2010
What Is Social Media
 
The Case for B2B Social Media: Womma Webinar

Ekaw2010 tutorial3 practical

  • 1. EKAW 2010 • Tutorial T3 Friday • 15th October 2010 Knowledge Acquisition from Social Networking Sites Z. Zhang, A.E. Cano, K. Elbedweihy, A.-S. Dadzie
  • 2. !"#$%&'()%"* 9*36:3+*.%',+1++/6"4*;;;* !"#$%&'&()#$&%#$%*&$#+,&*%-.%"&/0%1.2%333% 4./*&(%$-(2)-2(&%5%-.0%/&6&/%7%+1-2<1-,"-=+>+$(6,+,?* •! &-#-* •! '"&+$,#-"&*#.+*/$%(+&'$+*%0*1"%23+&4+*-(5'6,6)%"* –! data/animalcorpus/! 0$%7*,%(6-3*"+#2%$16"4*,6#+,* –! data/examples/! –! data/corpora/facebook_data | twitter_data/* •! 3+-$"*#%*',+*$+3+8-"#*#%%3,*#%*-(5'6$+*6"0%$7-)%"*-"&* •! (%&+* 1"%23+&4+*0$%7*,%(6-3*"+#2%$16"4*,6#+,* –! facebook/! –! twitter/! •! ($+-#+*-*,67/3+*-//36(-)%"*#%*&+7%",#$-#+*#.+* –! information_extraction/ekawtutorial/ | jatr_v1.0/* #+(."%3%46+,*6"*/$-()(+* •! +>#+$"-3*36@$-$6+,* –! lib/! •! &%2"3%-&,*0$%7*#'#%$6-3*2+@,6#+* http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/ tutorial_prep.html#exercise_downloads http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/ tutorial_prep.html#third_party_downloads 9*36:3+*.%',+1++/6"4*;;;* 9*36:3+*.%',+1++/6"4*;;;* 82,,#,+%-"&%900/#)9:.,$% B$#,+%9,-% •! A+,#*6"#+$"+#*(%""+()%"* –! #%*$'"*0-(+@%%1*-"&*#26:+$*+>-7/3+,* •! #+,#+&*26#.*;<=%>3?% •! 9,-*@'63&*,($6/#* –! @'63&;>73*B*+1-2;1-,"-;A+,#C'""+$*(3-,,** –! &%'@3+<(36(1*%"*,#-$#+$*D3+*0%$*+-(.*-//36(-)%"*-"&*E?F*EC * •! 7-G*"++&*#%*7%&60G*$64.#,*#%*+>+('#+*H(.7%&*IJJK* –! +"#+$*L-"#M*-#*(%",%3+*0%$*#%/*3+8+3*%0*+-(.*,%'$(+*(%&+*0%3&+$*EC* •! $-9,*9/.,&% –! ,+#*'/*(3-,,/-#.*HE?F*&+/+"&+"#K* –! (-33*N-8-(*26#.*+-(.*#+,#*(3-,,* •! @<A% –! ($+-#+*-*"+2*-//36(-)%"*',6"4*,$(*0%3&+$,*0%$*+-(.*%0*#26:+$O*0-(+@%%1*P*6+* –! ,+#*'/*(3-,,/-#.*H!QR*&+/+"&+"#K* –! ,+#*'/*-//36(-)%"*/$%/+$)+,*-"&*$'"*+-(.*7-6"*7+#.%&*
  • 3. 9*36:3+*.%',+1++/6"4*;;;* F+#'/* C-9,*9/.,&% D9)&E..F%9,*%-G#H&(%IJ@$% •! Q%('7+"#-)%"S* –! 0-(+@%%1*T$-/.*9U!S** http://developers.facebook.com/docs –! #26:+$*9U!** http://apiwiki.twitter.com/Twitter-API-Documentation •! F64"*'/S* –! 0-(+@%%1S*http://www.facebook.com –! #26:+$S*https://twitter.com/signup •! V6@$-$6+,* –! C+,#WXS*http://restfb.com* –! #26:+$YNS*http://twitter4j.org/en F+#'/* F(+"-$6%* K9-2(9/%L9,+29+&%J(.)&$$#,+%9,*%@,D.(M9:.,%A'-(9):.,% NO>O%C.2-"%ID(#)9%P.(/*%Q20%5%M9-)"%$2MM9(#$9:.,% •!*Q'$6"4*#.+*^_[_*`%$3&*a'/*#%'$"-7+"#*6"*F%'#.*90$6(-O* •!*E/+"ZVU*[;Y**]-8-*#%%316#*0%$*@'63&6"4*ZVU*-"&*!R*-//36(-)%",* #26:+$*-"&*0-(+@%%1*2+$+*',+&*+>#+",68+3G*-,*-*&6,(',,6%"* @%-$&*0%$*0-",*#%*+>(.-"4+*6"0%$7-)%"*-"&*%/6"6%",*-@%'#* –! (%"#-6",*/$+<@'63#*3-"4'-4+*7%&+3,*#%*@+*',+&*@G*E/+"ZVU*0%$* 3-"4'-4+*/$%(+,,6"4* 7-#(.+,b* http://opennlp.sourceforge.net –! *.'"&$+&,*%0*#.%',-"&,*%0*7+,,-4+,*2+$+*4+"+$-#+&*&-63G* http://oak.dcs.shef.ac.uk/ekaw_2010_ka_from_sna_tutorial/ %"*#.+*#2%*,%(6-3*"+#2%$16"4*,6#+,b* exercise_rscs/ie_models_eng.zip –! *-*3-$4+*/$%/%$)%"*%0*#.+,+*7+,,-4+,*&6,(',,*#.+*7-#(.*%0* #.+*&-Gb* •!*]-8-*9'#%7-)(*A+$7*C+(%4"6)%"*#%%316#*H]9ACK* •!*2+*-$+*6"#+$+,#+&*6"*-"-3G,6"4*#.+,+*7+,,-4+,** http://www.dcs.shef.ac.uk/~ziqizhang/resources/tools/ jatr_v1.0.zip –! #%*'"&+$,#-"&*2.-#*-$+*#.+*7%,#*/%/'3-$*#%/6(,*#.-#* 6"#+$+,#*/+%/3+b*
  • 4. F(+"-$6%*(%"#;* F(+"-$6%*9"-3G,6,* NO>O%C.2-"%ID(#)9%P.(/*%Q20%5%M9-)"%$2MM9(#$9:.,% 8&7)90%D(.M%-"&%M.(,#,+%$&$$#.,% •!*A%*&%*,%*2+*@'63#*-*c7-#(.*,'77-$6,-)%"d*-//36(-)%"* •!*.%2*#%*6&+")0G*,/+(6D(*(%"#+"#*%0*6"#+$+,#* –!%#,02-*<*).(02$%.D%M&$$9+&$%$+3-#+&*#%*-*7-#(.* –!*(%"#+"#*$+#$6+8-3*-"&*D3#+$6"4* –!%.2-02-*<*$-"1+&*36,#*%0*(&0(&$&,-9:6&%-&(M$%#.-#*(-"*@+* •!*.%2*#%*/$%(+,,*#.+*(%"#+"#*-"&*7-1+*,+",+*%0*6#* ',+&*#%*,'77-$6,+*(%$/',*(%"#+"#* –!*6"0%$7-)%"*+>#$-()%"** –!*"-#'$-3*3-"4'-4+*/$%(+,,6"4** •!*e,6"4*#.+*+>#$-(#+&*#+$7,*2+*(-"*-"-3G,+*2.-#*.-,*@++"* #.+*0%(',*%0*&6,(',,6%"*%0*#.+*7-#(.*%0*#.+*&-G* •!*W%$*#.6,*8+$G*+>+$(6,+O*2+*,#'&G*#.+*7-#(.*@+#2++"* 9*f"%23+&4+* A,+/9,*%9,*%R&(M9,1%.,%-"&%NS-"%.D%;2,&%NO>O;* 9(5'6,6)%"*/$%(+,,* F(+"-$6%*9"-3G,6,* a%$/',*T+"+$-)%"* •!%@,02-T%).(02$%.D%M&$$9+&$%$+3-#+&*#%*-*7-#(.* •! R.9/S*($+-#+*-*(%$/',*%0*7+,,-4+,** –! *2+*"++&*#%*/6"</%6"#*$+3+8-"#*7+,,-4+,*%"*#26:+$*-"&* –! #.-#* &6,(',,* #.+* 7-#(.* @+#2++"* R"43-"&* -"&* T+$7-"G* 0-(+@%%1* %"*^I#.*]'"+*^_[_* –! %',6"4%-G#H&(%-"&%D9)&E..F%9U!,U%2+*-//3G*).,-&,-% •! @,02-S* (&-(#&69/%9,*%V/-&(#,+*#%*@'63&*#.6,*(%$/',* a%$/',* –! #26:+$*9U!*/$%86&6"4*-((+,,*#%*#26:+$*&-#-* 4+"+$-)%"* –! 0-(+@%%1*9U!*/$%86&6"4*-((+,,*#%*0-(+@%%1*&-#-* –! (%"#+"#* D3#+$6"4* /-$-7+#+$,* H#.+* R"43-"&<T+$7-"G* •!%W2-02-T%$-"1+&*36,#*%0*(&0(&$&,-9:6&%-&(M$%* 7-#(.*%"*^I#.*]'"+*^_[_K* –! 2+*-//3G*@A%9,*%KLJ%%"*#.+*(%$/',*#%*-(.6+8+*#.6,*4%-3% •! W2-02-%% a%"#+"#* –! (%$/',*%0*7+,,-4+,*$+3-#+&*#%*%"3G*#.+*7-#(.*%0*6"#+$+,#* -"-3G,6,*@G*!R*
  • 5. a%$/',*T+"+$-)%"*',6"4*#26:+$* -G#H&(% a%&+*6"S*ekaw-kasna_exercises/twitter R>#+$"-3*36@,S*lib/twitter4j-core-2.1.6-SNAPSHOT.jar | log4j-1.2.15.jar a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3>%8AC!%IJ@%XI,9/1$#,+%-"&%02E/#)%:M&/#,&%$-9-2$Y% A'3>%8AC!%IJ@% •! U$%86&+,*7+#.%&,*0%$*0+#(.6"4*&-#-*$+3-#+&*#%S** •! 9"-3Gj+*#.+*,#$'(#'$+*-"&*(%"#+"#*%0*/'@36(* •! *A67+36"+,O*F#-#',O*e,+$,O*g+7@+$,O*,'@,($6@+$,O*0%33%2+$,O* )7+36"+*,#-#',+,* ,%(6-3*4$-/.,*+#(;* –! `.+$+*2-,*#.+*,#-#',*#2++#+&*0$%7k* –! P9#-h*i%'*2633*"++&*#%*(%7/3+#+*#.+*(%&+*0%$*6#*#%* –! `-,*6#*-*$+#2++#k -(#'-33G*&%*,%7+#.6"4h*<*R&6#*#.+*(3-,,S* R>+$(6,+* ekaw.kasna.twitter.StatusTest •! C+0+$*#%*#.+*A26:+$Y]*N-8-&%(*#%*(%7/3+#+*#.+*+>+$(6,+,S* *http://twitter4j.org/en/javadoc/index.html !(1%#-%1.2($&/DS*$'"*C-9-2$!&$-3Z969%
  • 6. a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3>%8AC!%IJ@% A'3>%8AC!%IJ@% •! 9",2+$ Twitter twitter = new try{ TwitterFactory().getInstance(); ResponseList<Status>publicTimeline = twitter.getPublicTimeline(); //*TODO Complete exercise and analyse structure and content of each status try{ GeoLocation geoLocation; //We request the public timeline, which returns a list of Status Place place; ResponseList<Status> publicTimeline = twitter.getPublicTimeline(); while (it.hasNext()){ /** Status st = it.next(); * Complete this exercise and analyse the structure and content log.info(st.getText()); of each of the Status. log.info(st.getSource()); * Have a look at the java doc of the Status Class, or just if ((geoLocation = st.getGeoLocation()) != null) check the available methods in your IDE log.info(geoLocation.toString()); */ if ((place = st.getPlace()) != null) { Iterator<Status> it = publicTimeline.iterator(); log.info(place.getFullName()); log.info(place.getBoundingBoxCoordinates().toString()); while (it.hasNext()){ } //TODO check what are the info you can get from a Status. } } } catch (TwitterException e){ •! !(1%#-%1.2($&/DS*+&6#*-"&*$'"*C-9-2$!&$-3Z969% } e.printStackTrace(); a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3>%8AC!%IJ@% A'3N%C&9()"%IJ@% •! E'#/'#**)7+36"+*,#-#', ??????????!!??888888888 RT @nico_news: ???????????????????????????????????????? http://bit.ly/aZcvfl <a href="http://twipple.jp/" rel="nofollow">?????/twipple</a> •! 933%2,*6"#+$-()%"*26#.*#26:+$* Southampton v Tranmere: Preview followed by live coverage of Saturday's game between Southampton and Tranmere in L... http://bit.ly/9N802N $&9()"*-"&*-(&,*$*&-#-* <a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a> Laper gueeee –! #%/*#%/6(,*#.-#*-$+*('$$+"#3G*#$+"&6"4*%"* <a href="http://www.snaptu.com" rel="nofollow">Snaptu.com</a> ?????????????????????????? 
/ ?????????????????????????? A26:+$* •! !#*+>/%,+,*#.+*0%33%26"4*7+#.%&,S** <a href="http://www.echofon.com/" rel="nofollow">Echofon</a> Changing the Language of Oppression http://bit.ly/aXA4w3 #specialneeds <a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a> Are you attending the SuperSwarm at Jewel, Piccadilly tonight? Let's get an idea of numbers via my poll @ www.theprgeek.co.uk –! ,+-$(.O** #superswarmLDN web –! #$+"&,O** Simon Cowell To Receive Special Emmy Award: October 7, 2010: Music mogul and former American Idol judge Simo... http:// tinyurl.com/299o5gg –! #$+"&,?('$$+"#O*#$+"&,?&-63GO*#$+"&,? <a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a> "Wajahmu seperti bulan" --» ini artinya ngatain kan yah? Org bulan bolong2 2++13G* <a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry®</a> FM???????????? <a href="http://stone.com/Twittelator" rel="nofollow">Twittelator</a> •! A.+*F+-$(.*9U!*,'//%$#,*-7%"4* ???? [????:?????/????????????????????????]559 #colopl_msg <a href="http://t.colopl.jp/t/" rel="nofollow">Colotwi</a> %#.+$,O*#.+*0%33%26"4*%/+$-#%$,*0%$* pikiran saya cabangnya banyak, jd pusing sendiri..penuh rasanya ni kepala (%",#$'()"4*-*5'+$G*,#$6"4* <a href="http://m.tweete.net" rel="nofollow">m.tweete.net</a>...
  • 7. a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3[%C&9()"%IJ@% A'3N%C&9()"%IJ@% –! P9#-h*i%'*2633*"++&*#%*(%7/3+#+*#.+*(%&+*0%$*6#*#%*-(#'-33G*&%* $#,)&#*T% F/+(6D+,*#.+*6&*%0*#.+*,#-#',*0$%7*2.6(.*#%*,#-$#*#.+*,+-$(.* ,%7+#.6"4h*<*R&6#*#.+*(3-,,S* 2,:/#*T% F/+(6D+,*#.+*6&*%0*#.+*,#-#',*0$%7*2.6(.*#%*+"&*#.+*,+-$(.* ekaw.kasna.twitter.QueryTest R>+$(6,+* C#,)&T% F#-#',+,*/$%&'(+&*,6"(+*-*,/+(6D+&*&-#+*H+;4;*^_[_<_l<[_K* B,:/T% Query query = new Query(); V/-&(T/#,F$% C+#$6+8+,*#2++#,*26#.%'#*36"1,* query.query("football"); D(.MT% C+#$6+8+,*,#-#',+,*0$%7*-*468+"*',+$;*H+;4;*0$%7S*D0-K* //*TODO Modify the query object, and search for /9,+T% C+#$6+8+,*,#-#',+,*6"*-*468+"*3-"4'-4+* today's tweets (in english) related to football W8% +;4;O*7+")%"6"4*g+>6(%*EC*W$-"(+* //*TODO Restrict your results to tweets generated within 300 kilometers of Johannesburg, South Africa T%Y% +;4;O*(%"#-6"6"4*0%%#@-33*26#.*-*/%,6)8+*-m#'&+*H+;4;*0%%#@-33*SK*K* // hint: use Query's geoCode method, the K&+9:.,% +;4;O*7+")%"6"4*@++$*@'#*"%#*$%%#* Kilometers unit is given as Query.KILOMETERS // hint: South Africa's lat: 26.12, long: 28.2 C.2()&T% +;4;O*a%"#-6"6"4*0%%#@-33*+"#+$+&*86-*A26:+$W++&*H+;4;*"+2,* ,%'$(+SA26:+$W++&K* •! !(1%#-%1.2($&/DS*$'"*]2&(1!&$-3Z969% a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3>%8AC!%IJ@% A'3N%C&9()"%IJ@% •! E'#/'#**5'+$G*$+5'+,#*0%$*L0%%#@-33M*"+-$*L]%.-""+,@'$4M –! I,$G&(% hits:15 Query query = new Query(); MQMhlanzi:Total Football 360: Bafana Eager to Keep the Momentum of Winning! http://t.co/xOPTaY9 Benleeds:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or query.query("football"); Morecambe or Dagenham and Redbridge? Tumelo13:Gota admit I miss my NONstop #football convo's wit @Denisao_4 and @GordonTyler8! 
Haha talk bout nothing but the #beautifulgame //*TODO Modify the query object, and search for Tumelo13:RT @Denisao_4: Ey bra @Tumelo13 that's not a sin! That's for the love of football! I approve wow! Let's hope it works :)?? today's tweets related to football Amen Edwardo84:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer jonerz97:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Morecambe //*TODO Restrict your results to tweets generated or Dagenham and Redbridge? within 300 kilometers of Johannesburg, South Africa dcocker11:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer AntimoOsato91:@siasduplessis Oros and The Dutch National Football Team could be good sponsors too! Haha :) IsaacTeka:#football - EURO 2012 qualifier between Germany and Turkey is gonna be a fierce encounter. #Ozil and #Khedira // hint: use Query's geoCode method, the applenessuk:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer johnyrotten:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Kilometers unit is given as Query.KILOMETERS Morecambe or Dagenham and Redbridge? // hint: Johannesburg’s lat: 26.12, long: 28.2 kartikverma:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer query.geoCode(new GeoLocation(26.12,28.2), RawRemedy:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Morecambe or Dagenham and Redbridge? 
30,Query.KILOMETERS); TLW1Dan:RT @BumbleCricket: Liverpool FC ...what a mess ...greed rears its head again ...football and fans suffer jopayne:RT @BumbleCricket: any big shot yank out there SO intersted in football that he would like to buy Accrington or Morecambe or Dagenham and Redbridge?
  • 8. a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3[%C-(&9M%IJ@% A'3[%C-(&9M%IJ@% Twitter 4j allows you to retrieve streaming samples using the class RestAPI and SearchAPI only present a limited snapshot of TwitterStream. For the public timeline you just need basic a timeline. During the finals of the 2010 World Cup authentication. the rate of tweets containing the tags #Spain, #Netherlands, #Germany, [*** Create a TwitterStream instance #Uruguay, was quite high. twitterStream = new TwitterStreamFactory(this).getInstance("yourAcc","yourPass"); Two options: Set a Listener for receiving the event of a status. Your listener should ^* •! make requests, say, every 2sec implement the method public void onStatus(Status status) through the RestAPI or the Search API, •! BETTER: twitterStream.setStatusListener(this); •! start listening to a stream of public l*** Start Sampling tweets & twitterStream.sample(); •! filter according to the tag patterns Y* Do something with the tweet in your onStatus method a%$/',*T+"+$-)%"*',6"4*#26:+$* a%$/',*T+"+$-)%"*',6"4*#26:+$* A'3[%C-(&9M%IJ@% A'3[%C-(&9M%IJ@% –! P9#-h*i%'*2633*"++&*#%*(%7/3+#+*#.+*(%&+*0%$*6#*#%*-(#'-33G*&%* –! I,$G&( ,%7+#.6"4h*<*R&6#*#.+*(3-,,S* ekaw.kasna.twitter.StreamTest private void startConsuming() throws TwitterException { twitterStream.setStatusListener(this); private void startConsuming() throws TwitterException { //*TODO Using TwitterStream’s filter method, twitterStream.setStatusListener(this); restrict your sampling to collect tweets that include the words: football, worldcup, final //*TODO Using TwitterStream’s filter method, restrict your sampling to collect tweets that include String[] filterWords = {"#worldcup", "#WorldCup", the words: football, worldcup, final "#Worldcup", "#WORLDCUP"}; twitterStream.setStatusListener(this); twitterStream.sample(); twitterStream.filter(0,null,filterWords); } twitterStream.sample(); } •! !(1%#-%1.2($&/DS*$'"*C-(&9M!&$-3Z969%
  • 9. Corpus Generation using twitter Corpus Generation using twitter Additional Exercise: Authentication • Try it yourself! • Authenticating using Oauth • restrictions to accessing private data!!! • OAuthTest.java • Using the application “Ekaw-Kasna” • CHANGES SEP 2010 • Login with your twitter account and go to: • change to authentication mode for retrieving individuals’ http://twitter.com/apps/new status information • from a simple username-password to: • Oauth-based authentication of registered “applications” Corpus Generation using twitter Corpus Generation using twitter • Authenticating using Oauth – Running the example requires a PIN • enter the URL at the console in a web browser • to obtain an oauth_token You will need these two strings for authenticating You will be giving authorization to this application to access your information
  • 10. Corpus Generation using twitter Corpus Generation using twitter • Authenticating using Oauth • Authenticating using Oauth – Running the example requires a PIN – Running the example requires a PIN • enter the URL to obtain an oauth_token • enter the URL to obtain an oauth_token – Once you “Allow” authorization you will be provided – Once you “Allow” authorization you will be provided with the PIN: with a PIN: – Enter the PIN to complete authentication This is the PIN “YOU ARE AUTHENTICATED!!” needed to complete the authentication Corpus Generation using facebook facebook Code in: ekaw-kasna_exercises/facebook External libs: lib/restfb-1.5.3.jar | log4j-1.2.15.jar
  • 11. 0-(+@%%1*9U!**W+#(.6"4*E@N+(#,* 0-(+@%%1*9U!**W+#(.6"4*e,+$*&-#-* •! The Graph API https://graph.facebook.com/facebook •! provides facilities for reading and writing data to facebook •! Each API request starts with the URL: https://graph.facebook.com •! e.g., data about any object can be found by fetching https://graph.facebook.com/objectID - objectID is the unique id of this object in the social graph - e.g., the unique id for a page is its name: https://graph.facebook.com/facebook 0-(+@%%1*9U!**a%""+()%",* 0-(+@%%1*9U!**a%""+()%",* •! All objects in the facebook social graph are connected via relationships (connections) •! Fetch connections https://graph.facebook.com/objectID/connection_type •! e.g., the page’s own posts https://graph.facebook.com/facebook/posts
  • 12. 0-(+@%%1*9U!**U-4+*a%""+()%",* 0-(+@%%1*9U!**W63#+$6"4*Q-#-* D&&*% A.+*/-4+M,*2-33* •! Data can be filtered using parameters 0#)-2(&% A.+*/-4+M,*/$%D3+*/6(#'$+* •! e.g., -9++&*% A.+*/.%#%,O*86&+%,O*-"&*/%,#,*6"*2.6(.*#.6,*/-4+*.-,*@++"*#-44+&* -! since, until ---> specify date ranges /#,F$% A.+*/-4+o,*/%,#+&*36"1,* -! limit ---> specify amount of returned data 0".-.$% A.+*/.%#%,*#.6,*/-4+*.-,*'/3%-&+&* +(.20$% A.+*4$%'/,*#.6,*/-4+*6,*-*7+7@+$*%0* 9/E2M$_6#*&.$% A.+*/.%#%*-3@'7,?86&+%,**#.6,*/-4+*.-,*($+-#+&* •! e.g., fetching the feed $-9-2$&$% A.+*/-4+o,*,#-#',*'/&-#+,* -! within specified dates and ,.-&$% A.+*/-4+o,*"%#+,* -! with a limit of 50 0.$-$% A.+*/-4+o,*%2"*/%,#,* https://graph.facebook.com/worldcup/feed? since=2010-07-17&until=2010-07-20&limit=50 M&ME&($% A.+*/-4+o,*7+7@+$,;*i%'*(-"*%"3G*5'+$G*'/*#%*J__*7+7@+$,;*!#*6,*"%#* /%,,6@3+*#%*6#+$-#+*#.$%'4.*#.+*36,#;*R>-7/3+S*.:/,S??4$-/.;0-(+@%%1;(%7? pU9TR=!Qq?7+7@+$,k3676#rJ__* &6&,-$% A.+*+8+"#,*#.6,*/-4+*6,*-:+"&6"4* )"&)F#,$% a.+(16",*7-&+*@G*0$6+"&,*%0*#.+*('$$+"#*,+,,6%"*',+$* 0-(+@%%1*9U!**W63#+$6"4*Q-#-* 0-(+@%%1*9U!**W6"&6"4*E@N+(#,** •! Search for objects https://graph.facebook.com/search? q=query&type=objectType c($+-#+&=)7+d*6,*26#.6"* - query ---> what you want to find #.+*,/+(6D+&*&-#+*$-"4+,* - objectType ---> type of the object (e.g. facebook post, user) •! e.g., search all public posts for “2010 world cup” https://graph.facebook.com/search?q=2010%20world %20cup&type=post
  • 13. 0-(+@%%1*9U!**W6"&6"4*E@N+(#,** 0-(+@%%1*9U!**T$-/.*9U!*R>+$(6,+* Try it yourself! •! Fetch the data about the page worldcup •! Get the feed of this page (hint: connection is feed) •! this is the wall for the page worldcup •! Return only the first 5 messages of this feed U%,#,*(%"#-6"6"4*#.+*#+$7,** c^_[_d*B*c2%$3&d*B*c('/d* •! Search for all pages containing worldcup in the page name 0-(+@%%1*9U!**T$-/.*9U!*R>+$(6,+* 0-(+@%%1*9U!**T$-/.*9U!*R>+$(6,+* •! ANSWERS •! ANSWERS •! page worldcup: •! Get the feed (wall) of the page worldcup: https://graph.facebook.com/worldcup/feed •! fetch https://graph.facebook.com/worldcup
  • 14. 0-(+@%%1*9U!**T$-/.*9U!*R>+$(6,+* 0-(+@%%1*9U!**T$-/.*9U!*R>+$(6,+* •! ANSWERS •! ANSWERS •! Return only the first 5 messages of the feed: •! Search for all pages containing worldcupin the https://graph.facebook.com/worldcup/feed&limit=5 page name https://graph.facebook.com/search?q=worldcup&type=page a36+"#*V6@$-$6+,* C+,#WX*9U!**`%$3&*a'/*F(+"-$6%** •! Multiple client libraries for facebook API •! Exercise: http://developers.facebook.com/search? get the messages sent on the day of the q=User:Client_Libraries England-Germany match - 27th of June 2010 •! RestFB client library was the first java library to support [*** Search for all pages containing “worldcup” the GraphAPI •! Other Java libraries now supporting GraphAPI ^* For every page: - BatchFB •! Get the messages posted on that day - TinyFBGraphClient •! Store the messages to generate your corpus - facebook Java Webapp •!We use the RestFB client library in this tutorial
  • 15. C+,#WX*9U!**Q+0-'3#0-(+@%%1a36+"#** C+,#WX*9U!**F+-$(.6"4* •! DefaultfacebookClient •! Step 1: •! provides methods for reading and writing data Connection<T> to facebook graph fetchConnection(String connection, Class<T> connectionType, FacebookClient facebookClient Parameter... parameters) = new DefaultfacebookClient(); 9((+,,*/'@36(*&-#-* facebookClient facebookClient = new DefaultfacebookClient(); facebookClient = new Connection<Page> pageSearch = DefaultfacebookClient(ACCESS_TOKEN); facebookClient.fetchConnection("search",Page.class, Parameter.with("q", "world cup"), Parameter.with("type", "page"), Parameter.with("limit", "10")); C+5'6$+&*#%*-((+,,*/$68-#+* &-#-*%$*+&6#?/'@36,.*&-#-* .:/,S??4$-/.;0-(+@%%1;(%7?,+-$(.k5r2%$3& B('/P#G/+r/-4+P3676#r[_* C+,#WX*9U!**F+-$(.6"4* C+,#WX*9U!**$+#'$"*0$%7*$+5'+,#*<*/-4+,* •! $+#'$",*-*36,#*%0*#.+*D$,#*[_*/-4+,*-@%'#*c2%$3&('/d* •! World Cup Pages •! W%$*+-(.*/-4+O*/$%/+$)+,*$+#'$"+&*6"(3'&+S* K9M&% Q9-&+.(1% @<% –! 6&O*"-7+O*(-#+4%$GO*0++&O*/6(#'$+,*b `%$3&*a'/* U%36)(6-",* J_tY[_YulvI* `%$3&*a'/* U$%&'(#,=%#.+$* [lJJJvYvuItt^lu* 4+#Q-#-*<<s*$+#'$",*-*36,#*%0*%@N+(#,*H&+/+"&6"4*%"*#.+* 2%$3&*('/* F/%$#,=-#.3+)(,* [lY[Ivl_l^vv_vl* (%""+()%"*$+5'+,#+&K* `%$3&*a'/*^_[_* U$%&'(#,=%#.+$* ^JIvvtYItvvv* C'4@G*`%$3&*a'/* F/%$#,=-#.3+)(,* [[v^Iv^l^[Il* for (Page page : pageSearch.getData()) { ^_[_*`%$3&*a'/* e"1"%2"* [^J_YtltY^_^^tJ* System.out.print("Name: " + page.getName()); w`ECVQ*aeUd* a3'@,* [^lvttYI^[Iv* System.out.print("Category: " + page.getCategory()); `%$3&*a'/*%"*RFUZ* F/%$#,=-#.3+)(,* [v[Jl[lt^_Y_* System.out.println("ID: " + page.getId()); `ECVQ*aeU* F/%$#,=#+-7,* [^_l_IlvYvv[_Jv* }* ^_[_*`%$3&*a'/* V%(-3=@',6"+,,* lvI[[uIIlt[v*
  • 16. C+,#WX*9U!**R>+$(6,+* C+,#WX*9U!**R>+$(6,+* Try it yourself! ANSWERS Connection<Group> groupSearch = •! Edit the class SearchTest.java facebookClient.fetchConnection( "search", Group.class, Parameter.with("q", "2010 world cup"), •! Search for all groups talking about a topic of Parameter.with("type", "group"), Parameter.with("limit", "15")); interest to you •! Get the first 15 groups for (Group group : groupSearch.getData()) { System.out.println("Name: " + group.getName()); •! For every group: System.out.println("ID: " + group.getId()); } - print name and ID C+,#WX*9U!**$+#'$"*0$%7*$+5'+,#*<*4$%'/,* C+,#WX*9U!**T+m"4*#.+*0++&* ‘2010 world cup’ groups K9M&% @<% •! Step 2: kkkkkkk**x-7-3+1*Ey(6-3*T$%'/* ^^JJ^[YItu[J* ^_[_*W!W9*`ECVQ*aeU* [^Y[Iulu_uJ[YJv* Connection<T> fetchConnection(String connection, ^_[_*W!W9*`%$3&*a'/* ^^_YtlvIYJ* Class<T> connectionType, ^_[_*W!W9*`ECVQ*aeU*FEeAn*9WC!a9* ^I_Ilt[tYJI* Parameter... parameters) ^_[_*W60-*`%$3&*a'/*F%'#.*90$61-* [^_uIl^[[^II[Ju* ^_[_*W!W9*`%$3&*a'/*F%'#.*90$6(-* [[[I_tJvJJ[YYlv* ^_[_*W60-*`%$3&*a'/*Q$6"16"4*T-7+* ^lv[^t[ut_^u* Connection<Post> myFeed = facebookClient.fetchConnection( ^_[_*W!W9*`ECVQ*aeU*FEeAn*9WC!a9* [_tJ^t^u^J[Jlt_* "worldcup/feed", Post.class, Parameter.with("since", g'"&6-3*^_[_*F'&-0$6(-*^_[_*`%$3&*('/* [uuv^tvtIlvl* "2010-06-27T11:00:00"), Parameter.with("until", "2010-06-28T17:00:00"), Parameter.with("limit", "10")); !#-36-*<*^_[_*W!W9*`%$3&*a'/* [tJYlYIlt^^* ^_[_<W!W9<`%$3&<a'/* [^vlIIll_I[^uIl* ^_[_*`%$3&*a'/** [[^_uJ^JttlJYYu* ^_[_*`%$3&*a'/* [ulll^l[vlIl* .:/,S??4$-/.;0-(+@%%1;(%7?2%$3&('/?0++&k ^_[_*W!W9*`%$3&*a'/* [l_YvttuvuvJYII* ,6"(+r^_[_<_v<^IP'")3r^_[_<_v<^tP3676#r^_* ^_[_*W!W9*`%$3&*a'/* [vl[Y_tt[uIt*
  • 17. CRFA*9U!**T+m"4*#.+*0++&* CRFA*9U!**$+#'$"*0$%7*$+5'+,#*<*0++&* Try it yourself! - ConnectionsTest.java •! 0++&*$+#'$",*-33*/%,#,*2$6:+"*%"*#.+*,/+(6D+&*&-#+* •! Message: the english were hoping to play penalties what a waste of their •! W%$*+-(.*/%,#*-:$6@'#+,*$+#'$"+&*6"(3'&+S* training time –! ($+-)%"*)7+O*/%,#*"-7+O*&+,($6/)%"b;* Creation Time: Sun Jun 27 17:45:13 BST 2010 •! Message: Deutschland, Deutschland über alles, über alles in der Welt Creation Time: Sun Jun 27 17:29:25 BST 2010 •! Message: world cup?? this wasn't a 'football games' but 'fakeball' games!! for (Post post : myFeed.getData()) { Lampard was scored but the referee was blind....4-1?? congrats to the referees coz they have a massive party tonite to celebrate!! $$$$$$$$$$$$$ System.out.println("Message: " + post.getMessage()); $$$ wow.... even can makes people blind!!! world cup??? **** off!!! System.out.println("tCreation Time" + Creation Time: Sun Jun 27 17:25:32 BST 2010 post.getCreatedTime()); •! Message: how are we suppose to be patriotic with a team that plays like }* that, none of them deserve the money they get, waste of time.............. Creation Time: Sun Jun 27 16:48:06 BST 2010 •! 
Message: john terry on england should get worst defender for the year...he's no good Creation Time: Sun Jun 27 16:42:39 BST 2010 CRFA*9U!**U%,#*U$%/+$)+,O*a%""+()%",* a%$/',*T+"+$-)%"*',6"4*0-(+@%%1* Properties I**#:.,9/%A'&()#$&T%I2-"&,:)9:.,% #*% A.+*/%,#*!Q* •!*$+,#$6()%",*#%*-((+,,6"4*/$68-#+*&-#-hhh* D(.M% 9"*%@N+(#*(%"#-6"6"4*#.+*!Q*-"&*"-7+*%0*#.+*',+$*2.%*/%,#+&*#.+*7+,,-4+* •!*9((+,,*A%1+"*$+5'6$+&*0%$*,%7+*7+#.%&,* -.% 9*36,#*%0*#.+*/$%D3+,*7+")%"+&*%$*#-$4+#+&*6"*#.6,*/%,#* M&$$9+&% A.+*7+,,-4+* •!#%*/$+8+"#*-((+,,*H$+-&*%$*2$6#+K*#%*/$68-#+*&-#-* 0#)-2(&% !0*-8-63-@3+O*-*36"1*#%*#.+*/6(#'$+*6"(3'&+&*26#.*#.6,*/%,#* •!+;4;O*/'@36,.6"4*#%*#.+*0-(+@%%1*,%(6-3*4$-/.* /#,F% A.+*36"1*-:-(.+&*#%*#.6,*/%,#* •!*X6&&6"4#%"*/$%86&+,*-*4%%&*+>/3-"-)%"*0%$*4+m"4*-((+,,*#%1+",*-#S* ,9M&% A.+*"-7+*%0*#.+*36"1* http://benbiddington.wordpress.com/2010/04/23/facebook-graph- )90:.,_*&$)(#0:.,% A.+*(-/)%"?&+,($6/)%"**%0*#.+*36"1*H-//+-$,*@+"+-#.*#.+*36"1*"-7+K* api-getting-access-tokens $.2()&% !0*-8-63-@3+O*#.+*,%'$(+*36"1*-:-(.+&*#%*#.6,*/%,#*H0%$*+;4;O*-*z-,.*%$*86&+%*D3+K* #).,% 9*36"1*#%*-"*6(%"*$+/$+,+")"4*#.+*#G/+*%0*#.6,*/%,#* 9H(#E2:.,% 9*,#$6"4*6"&6(-)"4*2.6(.*-//36(-)%"*2-,*',+&*#%*($+-#+*#.6,*/%,#* •!*+;4;O*0+#(.*#.+*0$6+"&,*%0*',+$*L1.-&6N-;+3@+&2+6.GM* 9):.,$% 9*36,#*%0*-8-63-@3+*-()%"*"-7+,*-"&*36"1,*H6"(3'&6"4*(%77+")"4O*3616"4*-"&*-"* •!*#.6,*$+5'6$+,*-'#.+")(-)%"**#%1+"*L>>`a`bO``O;;;M* %/)%"-3*-//<,/+(6D+&*-()%"K* https://graph.facebook.com/khadija.elbedweihy/ /#F&$% A.+*"'7@+$*%0*361+,*%"*#.6,*/%,#* friends&access_token=11585905509... )(&9-&*:M&% A.+*)7+*#.+*/%,#*2-,*6"6)-33G*/'@36,.+&* 20*9-&*:M&% A.+*)7+*%0*#.+*3-,#*(%77+"#*%"*#.6,*/%,#* 933*/$%/+$)+,*P* (%""+()%",*%0*-* •!%!(1%#-%1.2($&/D3;;;* Connections cU%,#d* ).MM&,-$% 933*%0*#.+*(%77+"#,*%"*#.6,*/%,#*
  • 18. facebook API – Fetching User data facebook API – Fetching User data https://graph.facebook.com/khadija.elbedweihy • fetch specific fields https://graph.facebook.com/khadija.elbedweihy?fields=id,name,picture Public Data only Link to the picture Picture at the given link facebook API – Authorization Example facebook API – Authorization Example Access token works for the authorized user only Same access token for a different user “does not work”
  • 19. 0-(+@%%1*9U!**e,+$*W6+3&,* 0-(+@%%1*9U!**e,+$*a%""+()%",* #*T% A.+*',+$M,*!Q* ".M&T% A.+*',+$M,*Z+2,*W++&;*C+5'6$+,*#.+*read_stream*/+$76,,6%"* V($-,9M&T% A.+*',+$M,*D$,#*"-7+* D&&*T% A.+*',+$M,*2-33;*C+5'6$+,*#.+*read_stream /+$76,,6%"*#%*,++* /9$-,9M&T% A.+*',+$M,*3-,#*"-7+* "%"</'@36(*/%,#,;* ,9M&T% A.+*',+$M,*0'33*"-7+* -9++&*T% A.+*/.%#%,O*86&+%,O*-"&*/%,#,*6"*2.6(.*#.6,*',+$*.-,*@++"* 9E.2-% A.+*',+$M,*@3'$@*#.-#*-//+-$,*'"&+$*#.+6$*/$%D3+*/6(#'$+* #-44+&;*C+5'6$+,*#.+*read_stream /+$76,,6%";* E#(-"*91% A.+*',+$M,*@6$#.&-G* 0.$-$T% A.+*',+$M,*%2"*/%,#,;*C+5'6$+,*#.+*read_stream /+$76,,6%"* G.(F_&*2)9:.,% 9*36,#*%0*#.+*2%$1?+&'(-)%"*.6,#%$G*0$%7*#.+*',+$M,*/$%D3+* #%*,++*"%"</'@36(*/%,#,;* &M9#/T% A.+*/$%>6+&*%$*(%"#-(#*+7-63*-&&$+,,*4$-"#+&*@G*#.+*',+$* 0#)-2(&T% A.+*',+$M,*/$%D3+*/6(#'$+* G&E$#-&% 9*36"1*#%*#.+*',+$M,*/+$,%"-3*2+@,6#+* D(#&,*$T% A.+*',+$M,*0$6+"&,* ".M&-.G,% A.+*',+$M,*.%7+#%2"* 9):6#:&$_#,-&(&$-$_ A.+*-()86)+,?6"#+$+,#,?7',6(?@%%1,?7%86+,?#+3+86,6%"*36,#+&*%"* M2$#)_E..F$_ #.+*',+$M,*/$%D3+* /.)9:.,% A.+*',+$M,*('$$+"#*3%(-)%"* M.6#&$_-&/&6#$#.,T% +&,*&(% A.+*',+$M,*4+"&+$* /#F&$T% 933*#.+*/-4+,*#.6,*',+$*.-,*L361+&M;*C+5'6$+,*#.+ user_likes %$* #,-&(&$-&*#,% T+"&+$,*#.+*',+$*6,*6"#+$+,#+&*6"* 0riend_likes*/+$76,,6%";* M&&:,+D.(% AG/+,*%0*$+3-)%",.6/,*#.+*',+$*6,*,++16"4* 0".-.$T% A.+*/.%#%,*#.6,*',+$*6,*#-44+&*6";*C+5'6$+,*#.+* user_photo_video_tagsO*friend_photo_video_tag,*-"&* (&/9:.,$"#0$-9-2$% A.+*',+$M,*$+3-)%",.6/*,#-#',* user_photos*%$ friend_photos*/+$76,,6%",;* (&/#+#.,% A.+*',+$M,*$+3646%"* F(+"-$6%*9"-3G,6,* @,02-T%-*).(02$%.D%M&$$9+&$%$+3-#+&*#%*-*7-#(.* –!*2+*"++&*#%*/6"</%6"#*$+3+8-"#*7+,,-4+,*%"*#26:+$*-"&* 0-(+@%%1* –!%2$#,+%-G#H&(%9,*%D9)&E..F%IJ@U%2+*-//3G*).,-&,-% (&-(#&69/%9,*%V/-&(#,+*#%*@'63&*#.6,*(%$/',* #,D.(M9:.,%&'-(9):.,% a%$/',* 4+"+$-)%"* W2-02-T%-*$-"1+&*36,#*%0*(&0(&$&,-9:6&%-&(M$%* –!*2+*-//3G*@A%9,*%KLJ%%"*#.+*(%$/',*#%*-(.6+8+*#.6,*4%-3% a%"#+"#* -"-3G,6,*@G*!R*
  • 20. a%"#+"#*9"-3G,6,*86-*!R* a%"#+"#*9"-3G,6,*86-*!R* •! !.%9,9/1$&%-"&%).,-&,-%9,*%&'-(9)-%#M0.(-9,-%-&(M$U% •! !.%9,9/1$&%-"&%).,-&,-%9,*%&'-(9)-%#M0.(-9,-%-&(M$U% G&%D.//.G%-"&$&%$-&0$% G&%D.//.G%-"&$&%$-&0$T* –! Z-#'$-3*3-"4'-4+*-"-3G,+,*%0*+-(.*7+,,-4+* –! Z-#'$-3*3-"4'-4+*-"-3G,+,*%0*+-(.*7+,,-4+* •! A%1+"6,-)%"* H#%1+"6,-)%"O*UEF*#-446"4K* E/+"ZVU* •! UEF*#-446"4* –! !&+")0G*(-"&6&-#+*6"0%$7-)%"*'"6#,*%0*6"#+$+,#* –! !&+")0G*(-"&6&-#+*6"0%$7-)%"*'"6#,*%0*6"#+$+,#** H/.$-,+*(.'"16"4O*+")#G*$+(%4"6)%"K* •! /.$-,+*(.'"16"4* –! !&+")0G*,#-),)(-33G*67/%$#-"#*6"0%$7-)%"** –! !&+")0G*,#-),)(-33G*67/%$#-"#*6"0%$7-)%"*H#+$7* •! #+$7*$+(%4"6)%"* $+(%4"6)%"K* ]9AC* a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,* a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,* •! R.9/S*/$%(+,,*"-#'$-3*3-"4'-4+*#+>#*,'(.*#.-#*,/+(6D(* •! I,%&'9M0/&S*H3%(-#+&*6"*c&-#-?+>-7/3+,? 6"0%$7-)%"*(-"*@+*6&+")D+&* +>-7/3+[;#>#dK* –! A.+,+*/$%(+,,+,*6"(3'&+* •! F+"#+"(+*,+47+"#-)%"* c8..,&1%D9#/$%-.%&,*%+.9/%*(.2+"-3%P91,&%8..,&1e$% •! A%1+"6,-)%"* -(#0%-.%C.2-"%ID(#)9%NO>O%E&+9,%G#-"%"#+"% •! U-$#*%0*F/++(.*#-446"4* &'0&)-9:.,$%E2-%"&%/&96&$%G#-".2-%9%$#,+/&%+.9/% •! @,02-* $).(&*%9f&(%-"(&&%+(.20%M9-)"&$%9,*%9%>7g%*&D&9-% –! -*,6"43+*7+,,-4+* -.%R&(M9,13d* •! W2-02-* –! -*,+5'+"(+*%0*UEF*#-44+&*#%1+",*
  • 21. Content Analysis – Natural Language Analysis Content Analysis – Natural Language Analysis • Sentence segmentation • Sentence segmentation using OpenNLP – Input: a single message /* Input */ (LINE 17) – Output: a list of sentences String pathToInput = "../../data/examples/example1.txt"; String content = "…"; Rooney fails to end goal drought. | Wayne Rooney's trip to /* Creates an object of OpenNLP sentence segmentation detector */ South Africa 2010 began with high expectations but he SentenceDetector detector = new SentenceDetector("lib/opennlp/models/EnglishSD.bin.gz"); leaves without a single goal scored after three group matches and a 1-4 defeat to Germany. /* Call the actual method to identify the end offsets of sentences. */ int[] result = detector.sentPosDetect(content); /* Print out the sentences */ Rooney fails to end goal drought. Wayne Rooney's Try it yourself! - SentenceSegmentation.java int start=0, i=0; trip to South Africa 2010 began with high expectations but he leaves without a single goal do { scored after three group matches and a 1-4 defeat …… to Germany. } while(start<result[result.length-1]); Content Analysis – Natural Language Analysis Content Analysis – Natural Language Analysis • Tokenisation • Tokenisation using OpenNLP – Input: a single sentence, or message /* Input text message */ (LINE 28) String content = "…" // read in the text content from "example1.txt" – Output: a list of tokens List<String> sentences = new ArrayList<String>(); …… /* Code for splitting sentences */ Rooney fails to end goal drought /*Creates an object of OpenNLP tokeniser using a pre-built English language model. */ //change the path accordingly String pathToEngTokenisationModel = "lib/opennlp/models/EnglishTok.bin.gz"; Rooney, fails, to, end, goal, drought, . Tokenizer tokeniser = new Tokenizer(pathToEngTokenisationModel); /*Tokenise each sentence and print out the result*/ Try it yourself! 
- Tokenisation.java for(String sentence: sentences){ String[] result=tokeniser.tokenize(sentence); for(String tok:result) System.out.println(tok); Rooney fails to end goal drought. }
  • 22. Content Analysis – Natural Language Analysis Content Analysis – Natural Language Analysis • Part of speech tagging • POS tagging using OpenNLP /*Input text message*/ (LINE 31) – Input: a list of tokens String content = "…" //read in the text content from example1.txt – Output: a list of tokens with their part of speech tag List<String> tokens = new ArrayList<String>(); /* Code for tokenisation and add the result into the list object above. You do not need to do sentence segmentation in this case. Because the tokenisation will detect sentence boundary as a first step*/ Rooney, fails, to, end, goal, drought, . /*Creates an object of OpenNLP POS tagger using a pre-built English language model.*/ //change the path accordingly Rooney/NNP fails/VBZ to/TO end/VB goal/NN drought/ String pathToEngPOSModel = "lib/opennlp/models/tag.bin.gz"; /* You MAY specify additionally two parameters for the constructor, i.e., NN ./. TagDictionary and Dictionary.*/ PosTagger tagger = new PosTagger(pathToEngPOSModel, (Dictionary)null); Try it yourself! - POSTagger.java /*Tag the list of tokens and print out the result*/ String[] result=tagger.tag(tokens.toArray(new String[0])); goal/NN Rooney/NNP fails/VBZ to/TO end/VB drought/NN ./. for (String tag: result) System.out.println(tag); Content Analysis – Phrase Chunking Content Analysis – Natural Language Analysis • Goal: identifying information units that make good • Phrase chunking candidate terms of our interest – Input: a list of POS-tagged tokens • In this exercise, we focus on noun phrases – Output: a list of phrases (nouns/verb phrases) – which often bear important domain-specific information Rooney/NNP fails/VBZ to/TO end/VB goal/NN drought/ NN ./. • Input – POS-tagged tokens • Output Rooney, goal drought – Noun phrases Exercise Try it yourself! edit the class PhraseChunker.java and run
  • 23. Content Analysis – Natural Language Analysis Content Analysis – Natural Language Analysis • Phrase chunking using OpenNLP • Phrase chunking using OpenNLP (LINE 44 in PhraseChunker.java) int[] result = detector.sentPosDetect(content); int start = 0, i = 0; (LINE 32 in PhraseChunker.java) do { //initialising all required NLP processors, If you get an out of memory //sentence splitting //exception, try increasing your JVM heap space to at least 256MB String sentence = content.substring(start, result[i]); String pathToEngTokenisationModel = "lib/opennlp/models/EnglishTok.bin.gz"; //TODO: tokenization, put tokens in a String array. Hint: String pathToEngPOSModel = "lib/opennlp/models/tag.bin.gz"; //Tokenisation.java String pathToEngPhraseModel = "lib/opennlp/models/EnglishChunk.bin.gz"; String[] tokens = null; //TODO: POS tagging, put tags in a String array. Hint: POSTagger.java SentenceDetector detector = new SentenceDetector("lib/opennlp/models/EnglishSD.bin.gz"); String[] tags = null; Tokenizer tokeniser = new Tokenizer(pathToEngTokenisationModel); //This is the method you use to chunk phrases on a list of tokens and PosTagger tagger = new PosTagger(pathToEngPOSModel, (Dictionary) null); //a list of tags String[] phrases = chunker.chunk(tokens, tags); TreebankChunker chunker = new TreebankChunker(pathToEngPhraseModel); //See the result for(String p:phrases) System.out.println(p); …… start = result[i]; i++; } while (start < result[result.length - 1]); Content Analysis – Natural Language Analysis Content Analysis – Natural Language Analysis • Phrase chunking using OpenNLP • 
J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ* (LINE 44 in PhraseChunker.java) (LINE 78 in PhraseChunker.java) int[] result = detector.sentPosDetect(content); String npstart = "B-NP"; a%&+*0$%7*36"+*It*%"2-$&,*/$%(+,,+,* int start = 0, i = 0; String vpstart = "B-VP"; #.6,*$+,'3#*-"&*4+"+$-#+,*#.+*$+-3* do { A.+*$+,'3#*6,*"%#*+>-(#3G*#.+*/.$-,+,*2+* String npcontinue = "I-NP"; /.$-,+,* //sentence splitting +>/+(#+&O*@'#*-*36,#*%0*c#-4,dO*2.6(.*-$+* String vpcontinue = "I-VP"; (%77%"3G*',+&*6"*ZVU*/.$-,+* String sentence = content.substring(start, result[i]); String other = "O"; //TODO: tokenization, put tokens in (.'"16"4S* array. a String String phrase = ""; String[] tokens=null; for (int n = 0; n < tokens.length; n++) { X<ZU*****C%%"+G * *C%%"+G* //TODO: POStagging, put tags in a String array. Hint: POSTagger.java if (phrases[n].equals(npstart) || phrases[n].equals(vpstart)) { String[]–tags = null; B “begin” X<}U******0-63,* phrase = tokens[n]; //ThisI is“inside” – the method you use to chunk phrases on a list of tokens and for (int m = n + 1; m < tokens.length; m++) { !<}U*******#% * * *0-63,*#%*+"&* //a list – “Noun phrase” NP of tags if (phrases[m].equals(npcontinue) || !<}U*******+"&* String[] phrases phrase” VP – “Verb = chunker.chunk(tokens, tags); X<ZU*****4%-3* phrases[m].equals(vpcontinue)) { //See the result !<ZU******&$%'4.# * for (int k = 0; k < phrases.length; k++) { *4%-3*&$%'4.#* phrase = phrase+" "+tokens[m]; } else { System.out.println(phrases[k] + "tt" + tokens[k]); System.out.println("Actual phrase: "+phrase); } phrase = ""; …… break; start = result[i]; ... i++; } } while (start < result[result.length - 1]);
  • 24. a%"#+"#*9"-3G,6,**Z-#'$-3*V-"4'-4+*9"-3G,6,* g%$+*+>+$(6,+,*60*G%'*-$+*6"#+$+,#+&* •! J"(9$&%)"2,F#,+%2$#,+%W0&,KLJ% –! A.+*-",2+$b;* •! C+/+-#*/$+86%',*#-,1,*',6"4*#.+*(%$/',*4+"+$-#+&* (LINE 44 in PhraseChunker.java) ',6"4*#.+*#26:+$*-"&*0-(+@%%1*9U!,* int[] result = detector.sentPosDetect(content); int start = 0, i = 0; •! A$GS* do { //sentence splitting –! F+"#+"(+*,+47+"#-)%"* String sentence = content.substring(start, result[i]); //TODO: tokenization, put tokens in a String array. B-NP Rooney –! A%1+"6,-)%"* String[] tokens=tokeniser.tokenize(sentence); B-VP //TODO: pos tagging, put tags in a String array. fails –! U-$#<%0<,/++(.*#-446"4* I-VP to String[] tags = tagger.tag(tokens); I-VP on a list of tokens //This is the method you use to chunk phrases end –! U.$-,+*(.'"16"4* //and a list of tags B-NP goal String[] phrases = chunker.chunk(tokens,I-NP tags); drought //See the result O . for(String p:phrases) Actual phrase: Rooney System.out.println(p); Actual phrase: fails to end …… Actual phrase: goal drought start = result[i]; i++; } while (start < result[result.length - 1]); Z+>#* Q%7-6"*A+$7*C+(%4"6)%"* •! !.%9,9/1$&%-"&%).,-&,-%9,*%&'-(9)-%#M0.(-9,-%-&(M$U% •! R.9/S*+>#$-(#*,#-),)(-33G*,64"6D(-"#*#+$7,O*2.6(.* G&%D.//.G%-"&$&%$-&0$% (%33+()8+3G*&+#+$76"+*#.+*,'77-$G*%0*#.+*7-#(.* –! Z-#'$-3*3-"4'-4+*-"-3G,+,*%0*+-(.*7+,,-4+* •! 8&)90T**&%7-6"*#+$7*$+(%4"6)%"*/$%(+&'$+* H#%1+"6,-)%"O*UEF*#-446"4K* –! KLJ%0(.)&$$&$%#%*6&+")0G*(-"&6&-#+*3+>6(%",O*+;4;O* –! !&+")0G*(-"&6&-#+*6"0%$7-)%"*'"6#,*%0*6"#+$+,#* "%'"</.$-,+,O*+"))+,* H/.$-,+*(.'"16"4O*+")#G*$+(%4"6)%"K* –! C-9:$:)9/%M&9$2(&$%#%*+8-3'-#+*#.+*,64"6D(-"(+*%0* (-"&6&-#+*3+>6(%",* –! !&+")0G*,#-),)(-33G*67/%$#-"#*6"0%$7-)%"*H#+$7* •! #+$7*0$+5'+"(G~*•<6&0~*2+6$&"+,,O*43%,,+>O*(<8-3'+O* $+(%4"6)%"K* #+$7+>*
  • 25. Q%7-6"*A+$7*C+(%4"6)%"* ]9AC**]-8-*9'#%7-)(*A+$7*C+(%4"6)%"*#%%316#* •! R.9/S*+>#$-(#*,#-),)(-33G*,64"6D(-"#*#+$7,O*2.6(.* •! ;I!8%% (%33+()8+3G*&+#+$76"+*#.+*,'77-$G*%0*#.+*7-#(.* –! ]-8-<@-,+&*#%%316#*0%$*&+8+3%/6"4*-"&*#+,)"4*&%7-6"* •! 8&)90T**&%7-6"*#+$7*$+(%4"6)%"*/$%(+&'$+* #+$7*$+(%4"6)%"*-34%$6#.7,* –! KLJ%0(.)&$$&$%#%*6&+")0G*(-"&6&-#+*3+>6(%",O*+;4;O* •! B$&%;I!8%-.% "%'"</.$-,+,O*+"))+,* –! +>#$-(#*&%7-6"*#+$7,*0$%7*-*(%33+()%"*%0* –! C-9:$:)9/%M&9$2(&$%#%*+8-3'-#+*#.+*,64"6D(-"(+*%0* &%('7+"#,* (-"&6&-#+*3+>6(%",* •! J*,#-#+<%0<#.+<-$#*-34%$6#.7,*67/3+7+"#+&* •! #+$7*0$+5'+"(G~*•<6&0~*2+6$&"+,,O*43%,,+>O*(<8-3'+O* –! 67/3+7+"#*-&&6)%"-3*-34%$6#.7,* #+$7+>* –! +8-3'-#+**&6{+$+"#*-34%$6#.7,*'"&+$*#.+*,-7+* 0$-7+2%$1* ]9AC**]-8-*9'#%7-)(*A+$7*C+(%4"6)%"*#%%316#* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! ;I!8%6,*-*]-8-<@-,+&*#%%316#*0%$*&+8+3%/6"4*-"&* •! ;I!8%* #+,)"4*&%7-6"*#+$7*$+(%4"6)%"*-34%$6#.7,* –! @-,6(*D(&h2&,)1%M&9$2(&* !"*#.+*0%33%26"4*+>+$(6,+O*G%'*2633*',+* •! B$&%;I!8%-.% #.+,+*-34%$6#.7,*-"&*(%7/-$+*#.+*$+,'3#,** –! J*-&&6)%"-3*,#-#+<%0<#.+<-$#*-34%$6#.7,*67/3+7+"#+&% –! +>#$-(#*&%7-6"*#+$7,*0$%7*-*(%33+()%"*%0* •! A+$7*0$+5'+"(G*6"8+$,+*&%('7+"#*0$+5'+"(G*H•<6&0K* &%('7+"#,* •! a<}-3'+* @-(14$%'"&*6,*(%8+$+&*6"* •! J*,#-#+<%0<#.+<-$#*-34%$6#.7,*67/3+7+"#+&* •! `+6$&"+,,* #.+%$G*,36&+,*H€[vl<*[IvK* –! 67/3+7+"#*-&&6)%"-3*-34%$6#.7,* •! T3%,,-$G*+>#$-()%"*HT3%,,+>K* –! +8-3'-#+**&6{+$+"#*-34%$6#.7,*'"&+$*#.+*,-7+* •! A+$7*+>#$-(#%$*HA+$7+>K* 0$-7+2%$1* •! (-"*@+*',+&*-,*-*(%77-"&<36"+*@-,+&*-//36(-)%"* 9&8-"(+&*#%/6(**2633*@+* (%8+$+&*@$6+zG*
  • 26. Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! ^.G%-.%2$&%;I!8%* •! ^.G%-.%2$&%;I!8%* –! V%(-#+*G%'$*]9AC*0%3&+$* –! ,#-$#*#.+*-//36(-)%"O*+;4;O*#.+*0$+5'+"(G*7+-,'$+* uk.ac.shef.wit.jatr.debug.TestFrequency –! (%"D4'$+*G%'$*-//36(-)%"*6"*Z9-(30(.0&(:&$*6"* pG%'$=N-#$q?#+,#* –! 26#.*76"67'7*7+7%$G*•7>J[^7* •! N-#$;,G,#+7;"3/rpG%'$=N-#$q?"3/=$+,%'$(+,* –! (%/G*N-#$*-"&*3%4Y*/$%/+$)+,*D3+,*#%*G%'$*(3-,,+,*0%3&+$* *9((+,,*#%*ZVU*#%%3,*$+5'6$+&*@G*]9AC* -|+$*(3+-"*-"&*$+(%7/63+* •! N-#$;,G,#+7;#+$7;7->2%$&,rJ* –! ,++*D3+*L5'6(1,#-$#;#>#M*6"*]9AC*0%3&+$*0%$*-&&6)%"-3* *g->67'7*"'7@+$*%0*2%$&,*6"*-*#+$7* 6"0%$7-)%"* •! N-#$;,G,#+7;#+$7;64"%$+=&646#,r#$'+* *a-"*-*#+$7*(%"#-6"*&646#,k %% –! #+,#*26#.*#.+*#26:+$*-"&*0-(+@%%1*(%$/%$-* –! (%/G*/$%/+$)+,*D3+,*#%*pG%'$=N-#$q?(3-,,+,* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* 82,,#,+%-"&%-&$-$%G#-"%9,-% 82,,#,+%-"&%-&$-$%G#-"%9,-* •! -"#*,($6/#*,+#*'/*#%*$'"*0$%7*0%3&+$*pG%'$=N-#$q?#+,#* •! &+0-'3#*-$4'7+"#,* –! /-#.=#%=(%$/',*r*0%3&+$*6"*pG%'$=N-#$q?#+,#?w)"Gw* –! /-#.=#%=$+0+$+"(+=(%$/',=,#-#,*r*pG%'$=N-#$q?w"3/=$+,%'$(+,? @"(='"60$5,;"%$7-3w* •! #%*',+*-3#+$"-)8+*-$4,*+"#+$*%"+*%$*@%#.*%0* –! ant -Dpath_to_corpus=alt_corpus_path -Dpath_to_reference_corpus_stats=alt_reference_corpus_stats_path •! %'#/'#*#%*#+,#*0%3&+$* –! /-:+$"S*!"#$%&'()*+),*B*9AC=9VTEC!Ang;#>#* –! %$*$'"*9VV*#+,#,*@G*(-336"4*LI/+.(#-"M!&$-&(M*
  • 27. Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! B,*&($-9,*#,+%-"&%.2-02-* •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi* –! A.+*/$%(+,,*%0*#.+*-//36(-)%"*6,*3%44+&*6"*cN-#$;3%4d* –! 3%%1*-#*$'"HK*7+#.%&*6"* –! A.+*$+,'3#,*-$+*%'#/'#*#%*-*D3+*(-33+&** uk.ac.shef.wit.jatr.debug.TestFrequency.java *p-34%$6#.7="-7+q=9AC=934%$6#.7;#>#O*+;4;O* cF67/3+=#+$7=0$+5'+"(G=9AC=9VTEC!Ang;#>#d* Part 1: Extracting candidate terms by NLP9*c,#%/*2%$&d*36,#*6,* ',+&*#%*$+7%8+*"%6,+* –! 2.6(.*(%"#-6",*$-"1+&*36,#*%0*#+$7,*+>#$-(#+&*0$%7*#.+* //stop word list 2%$&,O*+;4;O*L#.+MO*L-"&M* StopList stop = new StopList(true); (%$/',O*%"+*#+$7*/+$*36"+S* //lemmatiser 2%$3&('/*‚`ECVQaeU*‚`%$3&a'/*‚2%$3&('/*‚`%$3&('/ ***[u^Y;_* Lemmatiser lemmatizer = new Lemmatiser(); V+77-),-)%"*6,*',+&*#%* //noun phrase extractor "%$7-36,+*#+$7,*#%*#.+6$* (-"%"6(-3*0%$7,*H,++*#.+%$G* A.+*D$,#*#+$7* CandidateTermExtractornpextractor = new A.+*"'7@+$*6,* ,36&+,*[ll<*[lvK* 6,*#.+* A.+*%#.+$*#+$7,*-$+* NounPhraseExtractorOpenNLP(stop, lemmatizer); #.+*(-3('3-#+&* (-"%"6(-3*0%$7* #.+*8-$6-"#,*0%'"&*6"* ,(%$+*0%$*#.-#* …… %0*-33*%0*6#,* #.+*(%$/',* #+$7** ]9AC*',+,*-*&+0-'3#*%/+"<"3/* 8-$6-"#,** @-,+&*"%'"*/.$-,+*(.'"1+$*#%* +>#$-(#*(-"&6&-#+*#+$7,* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi* •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi* –! 3%%1*-#*$'"HK*7+#.%&*6"* U$%(+,,%$,* Part 1: Extracting candidate terms by NLP cont. uk.ac.shef.wit.jatr.debug.TestFrequency.java TermFreqCounter npcounter = new TermFreqCounter();$+5'6$+&*0%$* (%'")"4*#+$7* WordCounter wordcounter = new WordCounter(); 0$+5'+"(6+,* Rooney, fails, to, end, goal, drought, . 
//create global resource index builder, which indexes 9*c,#%/*2%$&d*36,#* Part 1: Extracting candidate terms by NLP global resources, //stop word list 6,*',+&*#%*$+7%8+* "%6,+*2%$&,* //such as documents and terms and their relations StopList stop = new StopList(true); GlobalResourceIndexBuilder builder = new //lemmatiser GlobalResourceIndexBuilder(); Lemmatiserlemmatizer = new Lemmatiser(); V+77-),-)%"*6,*',+&*#%* //build the global resource index //noun phrase extractor "%$7-36,+*#+$7,*#%*#.+6$* GlobalResourceIndex termDocIndex = builder.build(new W%$*6"&+>6"4*#+$7,* (-"%"6(-3*0%$7,*H,++*#.+%$G* CandidateTermExtractornpextractor = new CorpusImpl(args[0]), npextractor); -"&*&%('7+"#,* ,36&+,*[ll<*[lvK* NounPhraseExtractorOpenNLP(stop, lemmatizer); …. …… !"8%16"4*ZVU*/$%(+,,+,*#%*$+-&*6"* ]9AC*',+,*-*&+0-'3#*%/+"<"3/* &%('7+"#,O*,+47+"#*,+"#+"(+,O* @-,+&*"%'"*/.$-,+*(.'"1+$*#%* -//3G*#%1+"6,-)%"O*UEF*#-446"4O* +>#$-(#*(-"&6&-#+*#+$7,* -"&*/.$-,+*(.'"16"4*
  • 28. Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! B,*&($-9,*#,+%-"&%0(.)&$$%5%".G%*.&$%#-%G.(Fi* •! ^.G%-.%2$&%;I!8%*,'77-$G* Part 2: Apply statistical analyse on extracted terms. –! A%*$'"*%#.+$*-34%$6#.7,O*$+/3-(+*#.+*-34%$6#.7*#+,#+$*(3-,,* FeatureCorpusTermFrequency termCorpusFreq = 6"*#.+*(%77-"&S** a$+-#+*0+-#'$+,* new FeatureBuilderCorpusTermFrequency(npcounter, $+5'6$+&*@G*#.6,* *N-8-*p7+7%$G=(%"D4q*<(3-,,/-#.*p-33=N-$=D3+,q* wordcounter, lemmatizer).build(termDocIndex); /-$)('3-$* '1;-(;,.+0;26#;N-#$;&+@'4;p&+,6$+&=-34%$6#.7=#+,#+$q* -34%$6#.7* AlgorithmTester tester = new AlgorithmTester(); *p/-#.=#%=%'$=2%$3&=('/=(%$/',q* tester.registerAlgorithm(new FrequencyAlgorithm(), new –! F%'$(+*(%&+*-"&*N-8-&%(*-$+*-8-63-@3+* FrequencyFeatureWrapper(termCorpusFreq)); –! F%7+*-34%$6#.7,*7-G*-,1*0%$*-"*-&&6)%"-3*/-$-7+#+$*6"* tester.execute(termDocIndex); #.+*(%77-"&S*p/-#.=#%=$+0+$+"(+=(%$/',=,#-#,q* System.out.println("Ended at: " + new Date()); a$+-#+*-"*6",#-"(+*%0*#.+* -34%$6#.7*0%$*#+,)"4*-"&* •! #.+,+*-34%$6#.7,*',+*$+0+$+")-3*(%$/',*,#-),)(,*#%*(%7/'#+* 36"1*#%*6#,*$+5'6$+&* #.+*c#+$7"+,,d* !"8%1+*,#-),)(-3*-"-3G,6,O*6;+;O* 0+-#'$+,* •! ',+*#.+*D3+*c@"(='"60$5,;"%$7-3d*H,#-#,*%0*#.+*X$6),.* (%7/'#+*#.+*,(%$+*',6"4*#.+* -34%$6#.76(*0%$7'3-* Z-)%"-3*a%$/',K*'"&+$*cpG%'$=N-#$q?"3/=$+,%'$(+,q*.+$+* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* Q%7-6"*A+$7*C+(%4"6)%"*',6"4*]9AC* •! j.(&%&'&()#$&$* •! I*69,)&*%-.0#)%5%*&6&/.0M&,-%2$#,+%;I!8* –! A$G*-33*#.+*-34%$6#.7,*%"*#.+*&6{+$+"#*(%$/%$-* –! A%*&+8+3%/*"+2*-34%$6#.7,*',6"4*]9ACO*G%'*7',#* /$%86&+&*0%$*G%'* •! !7/3+7+"#*G%'$*%2"*-34%$6#.7O*67/3+7+")"4* •! ',6"4*#.+*`%$3&*a'/*(%$/',S*(%$/',*H#26:+$*P*0-(+@%%1K* #.+*6"#+$0-(+* •! #$G*#.+*`616/+&6-*(%$/',*0%$*-$)(3+,*-@%'#*-"67-3,*&-#-* uk.ac.shef.wit.jatr.core.algorithm.Algorithm H-"67-3(%$/',K* •! 
Implement your own algorithm feature – compare the effect of different corpora on accuracy wrapper – to fetch features required by your algorithm – your class must extend uk.ac.shef.wit.jatr.core.algorithm.AbstractFeatureWrapper • list of examples can be found in the package uk.ac.shef.wit.jatr.core.algorithm
  • 29. Domain Term Recognition using JATR The End - Summary • Advanced topic – development using JATR • In this exercise we have – Implement new features - for each create – Learnt to use the facebook and twitter APIs • a new class extending • to collect interesting data for specific application uk.ac.shef.wit.jatr.core.feature.AbstractFeature purpose • another class extending uk.ac.shef.wit.jatr.core.feature.AbstractFeature – Learnt to use OpenNLP Builder • to perform basic NLP tasks – a list of examples can be found in the package – Learnt to use JATR uk.ac.shef.wit.jatr.core.feature • to perform domain term recognition from a corpus – Try other NLP tools – Tested OpenNLP and JATR on – Create your own methods for extracting candidate terms (e.g., • a facebook corpus n-gram instead of noun phrases) • a twitter corpus • see uk.ac.shef.wit.jatr.core.npextractor The End – Final Words The End – A Big Thank You! • Knowledge acquisition from social networking sites is challenging – Exercises show that informal language and short, terse messages cause inaccuracies in results – This is however not a real-world application Thank you very much for • larger data sets, more varied data necessary to attending this tutorial! appreciate full scale of challenges – How to filter real useful terms from the result according to user interest? – How to link the terms to their context so they make sense? – and many more questions to consider…