{"id":145138,"date":"2003-01-01T00:00:00","date_gmt":"2003-01-01T00:00:00","guid":{"rendered":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/msr-research-item\/on-evaluation-and-training-set-construction-for-duplicate-detection\/"},"modified":"2018-10-16T20:10:35","modified_gmt":"2018-10-17T03:10:35","slug":"on-evaluation-and-training-set-construction-for-duplicate-detection","status":"publish","type":"msr-research-item","link":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/publication\/on-evaluation-and-training-set-construction-for-duplicate-detection\/","title":{"rendered":"On Evaluation and Training-Set Construction for Duplicate Detection"},"content":{"rendered":"<p>A variety of experimental methodologies have been used to evaluate the accuracy of duplicate-detection systems. We advocate presenting precision-recall curves as the most informative evaluation methodology. We also discuss a number of issues that arise when evaluating and assembling training data for adaptive systems that use machine learning to tune themselves to speci\ufb01c applications. We consider several different application scenarios and experimentally examine the effectiveness of alternative methods of collecting training data under each scenario. We propose two new approaches to collecting training data called static-active learning and weakly-labeled non-duplicates, and present experimental results on their effectiveness.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>A variety of experimental methodologies have been used to evaluate the accuracy of duplicate-detection systems. We advocate presenting precision-recall curves as the most informative evaluation methodology. We also discuss a number of issues that arise when evaluating and assembling training data for adaptive systems that use machine learning to tune themselves to speci\ufb01c applications. We [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"","msr_publisher_other":"","msr_booktitle":"Proceedings of the KDD-2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation","msr_chapter":"","msr_edition":"Proceedings of the KDD-2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"","msr_organization":"","msr_pages_string":"7\u201312","msr_page_range_start":"","msr_page_range_end":"","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"Proceedings of the KDD-2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"Mikhail Bilenko, Raymond J. Mooney","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":"","msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2003-01-01","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":2003,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":0,"footnotes":""},"msr-research-highlight":[],"research-area":[13560],"msr-publication-type":[193716],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[],"msr-conference":[],"msr-journal":[],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-145138","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-programming-languages-software-engineering","msr-locale-en_us"],"msr_publishername":"","msr_edition":"Proceedings of the KDD-2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation","msr_affiliation":"","msr_published_date":"2003-01-01","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"Proceedings of the KDD-2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation","msr_pages_string":"7\u201312","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"210394","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"file","title":"03-marlin-kdd-wkshp.pdf","viewUrl":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2016\/02\/03-marlin-kdd-wkshp.pdf","id":210394,"label_id":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[{"id":210394,"url":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2016\/02\/03-marlin-kdd-wkshp.pdf"}],"msr-author-ordering":[{"type":"user_nicename","value":"mbilenko","user_id":32850,"rest_url":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=mbilenko"},{"type":"text","value":"Raymond J. Mooney","user_id":0,"rest_url":false}],"msr_impact_theme":[],"msr_research_lab":[199565],"msr_event":[],"msr_group":[],"msr_project":[],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"inproceedings","related_content":[],"_links":{"self":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/145138","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":1,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/145138\/revisions"}],"predecessor-version":[{"id":523670,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/145138\/revisions\/523670"}],"wp:attachment":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/media?parent=145138"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=145138"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=145138"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=145138"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=145138"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=145138"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=145138"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=145138"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=145138"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=145138"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=145138"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=145138"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=145138"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}