/**
 * @author Amit Kumar
 * @date 12/10/2008
 * @extends Workbench.component.Component
 * @description This Component displays the similar documents.
 */

/**
 * Constructor: forwards its arguments to the superclass constructor.
 *
 * @param {Object} args Passed through unchanged to the superclass.
 */
Monk.component.SimilarityComponent = function(args) {
    Monk.component.SimilarityComponent.superclass.constructor.call(this, args);
};

Workbench.extend(Monk.component.SimilarityComponent, Workbench.component.Component, {
    label: "Find Similar Documents",
    description: "This component is for displaying similar documents",
    // NOTE(review): `this` is evaluated while this object literal is built, so it
    // is whatever `this` is at script top level, not the component instance.
    "window": this.window,

    /**
     * Reacts to workbench events.
     *
     * - WorksetsReceived: hides the "dirty" notice.
     * - WorksetSaved / WorksetDeleted / WorksetCreated: requests the project's worksets again.
     * - ChunkChecked / WorksetReset: shows the "dirty" notice.
     *
     * Every other event is ignored.
     *
     * @param {Object} monkEvent Event object exposing `instanceOf(type)`.
     * @param {Object} data Event payload (unused here).
     */
    handle: function(monkEvent, data) {
        if (monkEvent.instanceOf(Monk.event.project.WorksetsReceived)) {
            Ext.getCmp("dirty").setVisible(false);
        } else if (monkEvent.instanceOf(Monk.event.workset.WorksetSaved) ||
                   monkEvent.instanceOf(Monk.event.workset.WorksetDeleted) ||
                   monkEvent.instanceOf(Monk.event.workset.WorksetCreated)) {
            // reload data
            Monk.data.project.getWorksets(Monk.component.dataManager.getProjectId());
        } else if (monkEvent.instanceOf(Monk.event.chunk.ChunkChecked) ||
                   monkEvent.instanceOf(Monk.event.workset.WorksetReset)) {
            // warn about dirty data
            // NOTE(review): this condition used to test ChunkChecked twice; the
            // second test may have been meant for a different chunk event - confirm.
            Ext.getCmp("dirty").setVisible(true);
        }
    },

    // part of speech is stored here
    posListStore: new Ext.data.GroupingStore({
        url: "../../../resources/data/poslist.xml",
        reader: new Ext.data.XmlReader(
            {record: 'wordclass', id: 'value'},
            new Ext.data.Record.create([{name: 'id', mapping: 'value'}, 'label', 'description', 'select'])
        ),
        autoLoad: true,
        sortInfo: {field: 'label', direction: "ASC"},
        groupField: 'description'
    }),

    // features returned by the server for the current workset; loaded by getFeatures()
    featureListStore: new Ext.data.JsonStore({
        url: Monk.data.PROXY_URL + "get/AnalyticsManager.getMoreLikeThisWorkset",
        root: 'features',
        totalProperty: 'numFeatures',
        id: 'tag',
        fields: [
            {name: 'term', mapping: 'term'},
            {name: 'field', mapping: 'field'},
            {name: 'score', mapping: 'score', type: 'float'},
            {name: 'idf', mapping: 'idf', type: 'float'},
            {name: 'index_freq', mapping: 'index_freq', type: 'float'},
            {name: 'source_freq', mapping: 'source_freq', type: 'float'}
        ],
        baseParams: {maxFeatures: '-1', limit: 20, feature: 'lemma', onlyFeatures: true},
        remoteSort: false
    }),

    // NOTE(review): the stores below declare the field name 'label' twice; kept
    // as-is because the combos bind to 'id' and 'label' only.
    filterLemmaStore: new Ext.data.SimpleStore({
        fields: ['id', 'label', 'label'],
        data: [
            ['2', '2', "2"],
            ["3", "3", "3"], ["4", "4", "4"],
            ["5", "5", "5"], ["6", "6", "6"],
            ["7", "7", "7"], ["8", "8", "8"]
        ]
    }),

    // choices offered by the 'minWordLen' combo
    featureLenStore: new Ext.data.SimpleStore({
        fields: ['id', 'label', 'label'],
        data: [
            ['2', '2', "2"],
            ["3", "3", "3"], ["4", "4", "4"],
            ["5", "5", "5"], ["6", "6", "6"],
            ["7", "7", "7"], ["8", "8", "8"]
        ]
    }),

    // choices offered by the 'minTermFreq' combo
    featureFreqStore: new Ext.data.SimpleStore({
        fields: ['id', 'label', 'label'],
        data: [
            ['2', '2', "2"],
            ["3", "3", "3"], ["4", "4", "4"],
            ["5", "5", "5"], ["6", "6", "6"],
            ["7", "7", "7"], ["8", "8", "8"]
        ]
    }),

    // choices offered by the 'minDocFreq' combo
    featureFreqDocumentStore: new Ext.data.SimpleStore({
        fields: ['id', 'label', 'label'],
        data: [
            ['2', '2', "2"],
            ["5", "5", "5"], ["10", "10", "10"],
            ["15", "15", "15"], ["20", "20", "20"],
            ["25", "25", "25"], ["50", "50", "50"]
        ]
    }),

    // choices offered by the 'selectCorpus' combo
    corpusStore: new Ext.data.SimpleStore({
        fields: ['id', 'label', 'label'],
        data: [
            ['*', 'All Collections', "All Collections"],
            ["ncf", "19th Century Fiction", "NCF"],
            ["eebo", "Early English Books Online", "EEBO"],
            ["sha", "Shakespeare", "SHA"],
            ["eaf", "Early American Fiction", "EAF"],
            ["wright", "Wright Collection", "WRIGHT"]
        ]
    }),

    // choices offered by the 'feature' combo
    featureStore: new Ext.data.SimpleStore({
        fields: ['id', 'label'],
        data: [['lemma', 'Lemma'], ['spelling', 'Spelling']]
    }),

    // Template for one result row. The onclick handler calls showDocument() on a
    // global named `similarityComponent`, which is not defined in this file.
    resultTpl: new Ext.XTemplate(
        '<tpl for=".">',
        '<div class="search-item" onclick="similarityComponent.showDocument(\'{tag}\');">',
        '<h4><span id="searchItem-{tag}" >{title} {workTitle} score: <i>{score}</i></span></h4> ',
        '<hr/>',
        '</div></tpl>'
    ),

    /**
     * Called when the user selects Lemma or Spelling in the 'feature' combo.
     *
     * Spelling: hides the POS grid and enables 'minWordLen'.
     * Lemma: disables 'minWordLen' and shows the POS grid only while
     * 'findInWorkset' is checked.
     *
     * Does not use `this`; it is registered as a listener without a scope.
     */
    chooseFeature: function() {
        var featureVal = Ext.getCmp("feature").getValue();
        var posGrid = Ext.getCmp("ignorePosList");

        if (featureVal === "spelling") {
            posGrid.hide();
            Ext.getCmp("minWordLen").enable();
        } else if (featureVal === "lemma") {
            Ext.getCmp("minWordLen").disable();
            if (Ext.getCmp("findInWorkset").getValue()) {
                posGrid.show();
            } else {
                posGrid.hide();
            }
        }
    },

    /**
     * Get the list of POS classes selected in the 'ignorePosList' grid.
     *
     * @return {String|undefined} Comma-separated record ids, or `undefined`
     *     when no row is selected (callers test for a missing value).
     */
    getSelectedPOSList: function() {
        var selectedRecords = Ext.getCmp("ignorePosList").selModel.getSelections();
        if (selectedRecords.length === 0) {
            return undefined;
        }

        var ids = [];
        // The index is declared locally; it used to leak as an implicit global.
        for (var i = 0; i < selectedRecords.length; i++) {
            ids.push(selectedRecords[i].id);
        }
        return ids.join(",");
    },

    /**
     * Notifies a FeatureSelected event for the given feature.
     *
     * @param {Object} params Event payload; `params.term` is used in the event
     *     label (the grid's rowclick handler sends `{term, type}`).
     */
    displayConcordance: function(params) {
        // The label used to read an undeclared `featureInstance` variable, which
        // threw a ReferenceError; the term is taken from the payload instead.
        var term = params ? params.term : undefined;
        this.notify(
            new Monk.event.chunk.FeatureSelected({
                label: 'Text feature selected: ' + '"' + term + '"'
            }),
            params
        );
    },

    /**
     * Loads the feature list for the current workset into `featureListStore`
     * using the values currently entered in the form, then shows the feature
     * window.
     *
     * @return {*} The result of the messenger alert when the worklist is
     *     empty; otherwise nothing.
     */
    getFeatures: function() {
        var button = Ext.get('featurebutton');
        var selectedKeyString = this.getSelectedPOSList();

        // reset featurelist
        this.featureListStore.removeAll();

        var featureVal = Ext.getCmp("feature").getValue();
        var worksetIdVal = Monk.component.dataManager.getWorksetId();
        var minTermFreqVal = Ext.getCmp("minTermFreq").getValue();
        var minWordLenVal = Ext.getCmp("minWordLen").getValue();
        var minDocFreqVal = Ext.getCmp("minDocFreq").getValue();

        var worklist = Monk.component.dataManager.getWorklist();
        if (worklist.length === 0) {
            return Monk.component.messenger.alert(
                'Monk Workbench',
                'Please select a workset that contains text chunks.',
                this.window.parent ? this.window.parent.window : this.window
            );
        }

        var baseParams = {
            worksetId: worksetIdVal,
            feature: featureVal,
            minTermFreq: minTermFreqVal,
            minDocFreq: minDocFreqVal,
            minWordLen: minWordLenVal,
            onlyFeatures: true
        };

        if (Ext.getCmp("selectCorpus").isValid() && !Ext.getCmp("findInWorkset").getValue()) {
            baseParams.corpusTag = Ext.getCmp("selectCorpus").getValue();
            baseParams.findInCorpus = true;
            baseParams.aggregateDocuments = true;
        }
        if (selectedKeyString) {
            baseParams.ignoreList = selectedKeyString;
        }

        // Assign the parameters directly. This used to be done from a new
        // 'beforeload' listener added on every call and never removed, so
        // listeners piled up on the shared store.
        this.featureListStore.baseParams = baseParams;

        this.featureListStore.reload({
            params: {
                worksetId: worksetIdVal,
                feature: featureVal,
                maxDocuments: '-1',
                start: 0,
                limit: 20,
                onlyFeatures: true
            }
        });
        this.win.show(button);
    },

    /**
     * Builds the similarity query from the form values and notifies a
     * SimilarSearchQuery event carrying it.
     *
     * The ignore list is sent only for the 'lemma' feature, and only when the
     * search is not run against a corpus.
     *
     * @return {*} The result of the messenger alert when the worklist is
     *     empty; otherwise nothing.
     */
    process: function() {
        var featureVal = Ext.getCmp("feature").getValue();
        var worksetIdVal = Monk.component.dataManager.getWorksetId();
        var minTermFreqVal = Ext.getCmp("minTermFreq").getValue();
        var minWordLenVal = Ext.getCmp("minWordLen").getValue();
        var minDocFreqVal = Ext.getCmp("minDocFreq").getValue();

        var selectedKeyString = null;
        if (featureVal === "lemma") {
            selectedKeyString = this.getSelectedPOSList();
        }

        var worklist = Monk.component.dataManager.getWorklist();
        if (worklist.length === 0) {
            return Monk.component.messenger.alert(
                'Monk Workbench',
                'Please select a workset that contains text chunks.',
                this.window.parent ? this.window.parent.window : this.window
            );
        }

        Workbench.console.info("here do the similarity...");

        var baseParams = {
            worksetId: worksetIdVal,
            feature: featureVal,
            minTermFreq: minTermFreqVal,
            minDocFreq: minDocFreqVal,
            minWordLen: minWordLenVal,
            format: 'xml'
        };

        if (Ext.getCmp("selectCorpus").isValid() && !Ext.getCmp("findInWorkset").getValue()) {
            // looking for similar documents in the collection
            baseParams.corpusTag = Ext.getCmp("selectCorpus").getValue();
            baseParams.findInCorpus = true;
            baseParams.aggregateDocuments = true;
        } else if (selectedKeyString != null) {
            // `!= null` on purpose: getSelectedPOSList() yields undefined when
            // nothing is selected.
            baseParams.ignoreList = selectedKeyString;
        }

        Workbench.component.manager.notify(
            new Monk.event.workbench.SimilarSearchQuery({label: 'similar search query: '}),
            baseParams
        );
    },

    /**
     * Notifies a ChunkSelected event for the given document.
     *
     * @param {String} docId Used as both the `id` and `text` of the payload.
     */
    showDocument: function(docId) {
        this.notify(
            new Monk.event.chunk.ChunkSelected({
                label: 'Text workpart selected: ' + '"' + docId + '"'
            }),
            {id: docId, text: docId, displayText: true}
        );
    },

    /**
     * Builds the UI: the search form inside a tab panel rendered in a
     * viewport, plus the feature grid inside a window stored on `this.win`.
     * Also requests the project's worksets.
     */
    init: function() {
        // notice that the workset has unsaved changes; toggled by handle()
        var dirtySpan = {
            id: 'dirty',
            html: '<span class="dirty">Please save the current workset to see up-to-date data.</span>',
            border: true,
            colspan: 2
        };

        var messageBox = {
            id: 'msg1',
            html: '<div class="message">Select a workset and then choose if you want to find test documents similar ' +
                  ' to the training set or if you want corpus documents that are most similar ' +
                  'to the workset (aggregates both the training and test documents).</div>',
            border: true,
            hidden: true,
            colspan: 2
        };

        // used both as the first column and as the selection model of the POS grid
        var posSelectionModel = new Ext.grid.CheckboxSelectionModel();

        var featureForm = {
            xtype: 'form',
            id: 'similarity_form',
            border: false,
            defaults: {
                // applied to each contained panel
                bodyStyle: 'padding:20px'
            },
            labelAlign: 'right',
            labelWidth: 150,
            monitorValid: true,
            items: [
                // checkbox: search the unrated workset documents instead of a corpus
                {
                    id: 'findInWorkset',
                    labelAlign: 'right',
                    fieldLabel: 'In Unrated workset documents',
                    xtype: 'checkbox',
                    forceSelection: true,
                    checked: true,
                    listeners: {
                        check: {
                            fn: function(cmp, isChecked) {
                                // the corpus combo is required and enabled only when unchecked
                                var selectCorpus = Ext.getCmp("selectCorpus");
                                selectCorpus.allowBlank = isChecked;
                                selectCorpus.setDisabled(isChecked);
                                selectCorpus.validate();
                                Ext.getCmp("ignorePosList").setVisible(
                                    isChecked && Ext.getCmp("feature").getValue() === "lemma"
                                );
                            },
                            scope: this
                        }
                    },
                    border: true
                },

                // corpus selection; its disabled state is toggled by the check listener above
                {
                    id: 'selectCorpus',
                    allQuery: '*',
                    labelAlign: 'right',
                    fieldLabel: 'In the corpus',
                    blankText: 'Select Corpus',
                    emptyText: 'Select Corpus',
                    xtype: 'combo',
                    valueField: 'id',
                    displayField: 'label',
                    disabled: false,
                    editable: false,
                    triggerAction: 'all',
                    store: this.corpusStore,
                    mode: 'local',
                    border: true
                },

                // feature list
                {
                    id: 'feature',
                    labelAlign: 'right',
                    fieldLabel: 'Feature Type',
                    blankText: 'Select Feature',
                    emptyText: 'Select Feature',
                    xtype: 'combo',
                    valueField: 'id',
                    displayField: 'label',
                    disabled: false,
                    editable: false,
                    allowBlank: false,
                    triggerAction: 'all',
                    store: this.featureStore,
                    value: 'spelling',
                    mode: 'local',
                    listeners: {
                        select: {fn: this.chooseFeature}
                    }
                },

                {
                    id: 'minWordLen',
                    labelAlign: 'right',
                    fieldLabel: 'Min. length (Characters)',
                    blankText: 'Select Minimum Feature Length',
                    emptyText: 'Select Minimum Feature Length',
                    xtype: 'combo',
                    valueField: 'id',
                    displayField: 'label',
                    disabled: false,
                    editable: true,
                    value: 5,
                    triggerAction: 'all',
                    store: this.featureLenStore,
                    mode: 'local',
                    allowBlank: false,
                    regex: /^\d+$/,
                    regexText: 'Please specify a numeric value.'
                },

                {
                    id: 'minTermFreq',
                    labelAlign: 'right',
                    fieldLabel: 'Feature should occur at least times in the training documents',
                    blankText: 'Select Feature Frequency',
                    emptyText: 'Select Feature Frequency',
                    xtype: 'combo',
                    valueField: 'id',
                    displayField: 'label',
                    disabled: false,
                    editable: true,
                    value: 3,
                    triggerAction: 'all',
                    store: this.featureFreqStore,
                    mode: 'local',
                    allowBlank: false,
                    regex: /^\d+$/,
                    regexText: 'Please specify a numeric value.'
                },

                {
                    id: 'minDocFreq',
                    labelAlign: 'right',
                    fieldLabel: 'Feature should occur in at least these many training documents ',
                    blankText: 'Select Feature Document Frequency',
                    emptyText: 'Select Feature Document Frequency',
                    xtype: 'combo',
                    valueField: 'id',
                    displayField: 'label',
                    disabled: false,
                    editable: true,
                    value: 2,
                    triggerAction: 'all',
                    store: this.featureFreqDocumentStore,
                    mode: 'local',
                    allowBlank: false,
                    regex: /^\d+$/,
                    regexText: 'Please specify a numeric value.'
                },

                // grid of word classes to ignore; hidden at the end of init()
                {
                    id: 'ignorePosList',
                    bodyStyle: 'padding: none;',
                    xtype: "editorgrid",
                    store: this.posListStore,
                    clicksToEdit: 1,
                    columns: [
                        posSelectionModel,
                        {id: 'id', header: "id", width: 75, sortable: true, align: 'left', dataIndex: 'id'},
                        {header: "description", width: 75, sortable: true, align: 'left', dataIndex: 'description', hidden: true},
                        {header: "label", width: 75, sortable: true, align: 'left', dataIndex: 'label'}
                    ],
                    sm: posSelectionModel,
                    stripeRows: true,
                    autoExpandColumn: 'id',
                    height: 200,
                    width: 250,
                    cls: "ignorePOSClass",
                    title: 'Ignore Following Word Class',
                    view: new Ext.grid.GroupingView({
                        forceFit: true,
                        groupTextTpl: '{text} ({[values.rs.length]} {[values.rs.length > 1 ? "Items" : "Item"]})'
                    })
                }
            ],
            buttons: [
                // the go button here
                {
                    id: 'gobutton',
                    xtype: 'button',
                    fieldLabel: 'Click to Search.',
                    text: 'find similar documents',
                    tooltip: 'click here to find similar documents',
                    formBind: true,
                    listeners: {
                        click: {fn: this.process, scope: this}
                    }
                },
                // the feature button here
                {
                    id: 'featurebutton',
                    xtype: 'button',
                    fieldLabel: 'Click to Get Features.',
                    text: 'find features',
                    tooltip: 'click here to find document features',
                    formBind: true,
                    listeners: {
                        click: {fn: this.getFeatures, scope: this}
                    }
                }
            ]
        };
        // end of the form

        var featureFormPanel = new Ext.Panel({
            title: 'Search for Similar Documents',
            layout: 'fit',
            border: false,
            autoScroll: true,
            items: [
                // span that indicates if the workset is dirty
                dirtySpan,
                messageBox,
                // the search form
                featureForm
            ]
        });

        Monk.data.project.getWorksets(Monk.component.dataManager.getProjectId());

        var mainPanel = new Ext.TabPanel({
            activeTab: 0,
            autoWidth: true,
            height: 600,
            plain: true,
            layoutOnTabChange: true,
            defaults: {autoScroll: true},
            items: [featureFormPanel]
        });

        var featureListPagingBar = new Ext.PagingToolbar({
            // Must equal the `limit` sent by getFeatures(); it used to be 25
            // while every load requested 20 rows.
            pageSize: 20,
            store: this.featureListStore,
            displayInfo: true,
            displayMsg: 'Displaying features {0} - {1} of {2}',
            emptyMsg: "No features to display",
            items: ['-']
        });

        var selectionModel = new Ext.grid.RowSelectionModel({
            singleSelect: true
        });

        // create the feature Grid
        var grid = new Ext.grid.GridPanel({
            store: this.featureListStore,
            sm: selectionModel,
            columns: [
                {id: 'term', header: "Term", width: 160, sortable: true, dataIndex: 'term'},
                {header: "score", width: 75, sortable: true, dataIndex: 'score', type: 'float'},
                {header: "idf", width: 75, sortable: true, dataIndex: 'idf', type: 'float'},
                {header: "Count in Test Docs", width: 75, sortable: true, dataIndex: 'index_freq', type: 'float'},
                {header: "Count in Training Docs", width: 85, sortable: true, dataIndex: 'source_freq', type: 'float'}
            ],
            stripeRows: true,
            autoExpandColumn: 'term',
            height: 350,
            width: 600,
            viewConfig: {forceFit: true},
            autoWidth: true,
            title: 'Feature List',
            bbar: featureListPagingBar,
            listeners: {
                // notify a FeatureSelected event for the selected row's term
                rowclick: function(grid, rowIndex, event) {
                    var record = grid.getSelectionModel().getSelected();
                    if (!record) {
                        // nothing is selected, so there is no term to report
                        return;
                    }

                    var featureInstance = record.get('term');
                    // a term ending in a parenthesised suffix is treated as a lemma
                    var posTest = /\(.+?\)$/;
                    var type = featureInstance.match(posTest) ? "lemma" : "spelling";

                    var params = {
                        term: featureInstance,
                        type: type
                    };

                    Workbench.component.manager.notify(
                        new Monk.event.chunk.FeatureSelected({
                            label: 'Text feature selected: ' + '"' + featureInstance + '"'
                        }),
                        params
                    );
                }
            }
        });

        var viewport = new Ext.Viewport({
            layout: 'fit',
            renderTo: document.body,
            items: [mainPanel]
        });

        // window holding the feature grid; shown by getFeatures()
        this.win = new Ext.Window({
            title: 'List of Features',
            applyTo: 'feature-win',
            layout: 'fit',
            collapsible: true,
            width: 500,
            height: 300,
            closeAction: 'hide',
            plain: true,
            items: [grid]
        });

        // the default feature is 'spelling', for which the POS grid is not shown
        Ext.getCmp("ignorePosList").hide();
    }
});