twitter - How to find the number of documents (and fraction) per topic using LDA? -
I am trying to extract topics from 7 million tweets. I have assumed that each tweet is a document, so I stored all the tweets in one file in which each line (i.e. each tweet) is treated as a document. I used this file as the input file for the MALLET API.
public static void ldamodel(int numofk,int numbofiteration,int numberofthread,string outputdir,instancelist instances) throws exception { // create model 100 topics, alpha_t = 0.01, beta_w = 0.01 // note first parameter passed sum on topics, while // second parameter single dimension of dirichlet prior. int numtopics = numofk; paralleltopicmodel model = new paralleltopicmodel(numtopics, 1.0, 0.01); model.addinstances(instances); // use 2 parallel samplers, each @ 1 half corpus , combine // statistics after every iteration. model.setnumthreads(numberofthread); // run model 50 iterations , stop (this testing only, // real applications, use 1000 2000 iterations) model.setnumiterations(numbofiteration); model.estimate(); // show words , topics in first instance // data alphabet maps word ids strings alphabet dataalphabet = instances.getdataalphabet(); featuresequence tokens = (featuresequence) model.getdata().get(0).instance.getdata(); labelsequence topics = model.getdata().get(0).topicsequence; formatter out = new formatter(new stringbuilder(), locale.us); (int position = 0; position < tokens.getlength(); position++) { // out.format("%s-%d ", dataalphabet.lookupobject(tokens.getindexatposition(position)), topics.getindexatposition(position)); out.format("%s-%d ", dataalphabet.lookupobject(tokens.getindexatposition(position)), topics.getindexatposition(position)); } system.out.println(out); // estimate topic distribution of first instance, // given current gibbs state. 
double[] topicdistribution = model.gettopicprobabilities(0); // array of sorted sets of word id/count pairs arraylist<treeset<idsorter>> topicsortedwords = model.getsortedwords(); // show top 10 words in topics proportions first document string topicsoutput=""; (int topic = 0; topic < numtopics; topic++) { iterator<idsorter> iterator = topicsortedwords.get(topic).iterator(); out = new formatter(new stringbuilder(), locale.us); out.format("%d\t%.3f\t", topic, topicdistribution[topic]); int rank = 0; while (iterator.hasnext() && rank < 10) { idsorter idcountpair = iterator.next(); out.format("%s (%.0f) ", dataalphabet.lookupobject(idcountpair.getid()), idcountpair.getweight()); //out.format("%s ", dataalphabet.lookupobject(idcountpair.getid())); rank++; } system.out.println(out); } // create new instance high probability of topic 0 stringbuilder topiczerotext = new stringbuilder(); iterator<idsorter> iterator = topicsortedwords.get(0).iterator(); int rank = 0; while (iterator.hasnext() && rank < 10) { idsorter idcountpair = iterator.next(); topiczerotext.append(dataalphabet.lookupobject(idcountpair.getid()) + " "); rank++; } // create new instance named "test instance" empty target , source fields. 
instancelist testing = new instancelist(instances.getpipe()); testing.addthrupipe(new instance(topiczerotext.tostring(), null, "test instance", null)); topicinferencer inferencer = model.getinferencer(); double[] testprobabilities = inferencer.getsampleddistribution(testing.get(0), 10, 1, 5); system.out.println("0\t" + testprobabilities[0]); file pathdir = new file(outputdir + file.separator+ "numoftopics"+numtopics); //fixme replace strings constants pathdir.mkdir(); string dirpath = pathdir.getpath(); string statefile = dirpath+file.separator+"output_state.gz"; string outputdoctopicsfile = dirpath+file.separator+"output_doc_topics.txt"; string topickeysfile = dirpath+file.separator+"output_topic_keys"; printwriter writer=null; string topickeysfile_fromprogram = dirpath+file.separator+"output_topic"; try { writer = new printwriter(topickeysfile_fromprogram, "utf-8"); writer.print(topicsoutput); writer.close(); } catch (exception e) { e.printstacktrace(); } model.printtopwords(new file(topickeysfile), 11, false); model.printdocumenttopics(new file (outputdoctopicsfile)); model.printstate(new file (statefile)); } public static void main(string[] args) throws exception{ // begin importing documents text feature sequences arraylist<pipe> pipelist = new arraylist<pipe>(); // pipes: lowercase, tokenize, remove stopwords, map features pipelist.add( new charsequencelowercase() ); pipelist.add( new charsequence2tokensequence(pattern.compile("\\p{l}[\\p{l}\\p{p}]+\\p{l}")) ); pipelist.add( new tokensequenceremovestopwords(new file("h:\\data\\stoplists\\en.txt"), "utf-8", false, false, false) ); pipelist.add( new tokensequence2featuresequence() ); instancelist instances = new instancelist (new serialpipes(pipelist)); reader filereader = new inputstreamreader(new fileinputstream(new file("e:\\thesis data\\dataforlda\\freshnewdata\\cleantweets.txt")), "utf-8"); instances.addthrupipe(new csviterator (filereader, pattern.compile("^(\\s*)[\\s,]*(\\s*)[\\s,]*(.*)$"), 3, 2, 1)); // 
data, label, name fields int numberoftopic=5; int numberofiteration=50; int numberofthread=6; string outputdir="j:\\topics\\"; //int numberoftopic=5; ldamodel(numberoftopic,numberofiteration,numberofthread,outputdir,instances); timeunit.seconds.sleep(30); numberoftopic=10; }
I got 3 files from the above program: 1. the state file, 2. the topic proportion file, 3. the key topic list.
I want to find out the number of documents allocated to each topic. For example, I got the following output in the key topic list file:
- 0.004 obama (5471) canada (5283) woman (5152) vote (4879) police(3965)
where the first column is the topic serial number, the second column is the topic weight, and the third column lists the words under the topic (with the number of words in parentheses).
Here, I got the number of words under each topic, but I would like to show the number of documents that got each topic. It would be helpful to show this output in a separate file. For example,
topic 1: doc1(80%) doc2(70%) .......
Could you please give me an idea or some source code for this? Thanks.
The information you are looking for is contained in the file "2. topic proportion" that you mentioned. Note that every document contains each topic with some percentage (although the percentage may be large for one topic and extremely small for the others). You have to decide what you want to extract from the file: the dominant topic (it is in column 3); the dominant topic only when its percentage is at least 50% (sometimes, two topics have the same percentage); ...
Comments
Post a Comment