{"id":820570,"date":"2022-02-16T19:17:25","date_gmt":"2022-02-17T03:17:25","guid":{"rendered":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/?post_type=msr-research-item&#038;p=820570"},"modified":"2022-02-16T19:17:25","modified_gmt":"2022-02-17T03:17:25","slug":"mmpt21-international-joint-workshop-on-multi-modal-pre-training-for-multimedia-understanding","status":"publish","type":"msr-research-item","link":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/publication\/mmpt21-international-joint-workshop-on-multi-modal-pre-training-for-multimedia-understanding\/","title":{"rendered":"MMPT&#8217;21: International Joint Workshop on Multi-Modal Pre-Training for Multimedia Understanding"},"content":{"rendered":"<p>Pre-training has been an emerging topic that provides a way to learn strong representation in many fields (eg, natural language processing, computing vision). In the last few years, we have witnessed many research works on multi-modal pre-training which have achieved state-of-the-art performances on many multimedia tasks (eg, image-text retrieval, video localization, speech recognition). In this workshop, we aim to gather peer researchers on related topics for more insightful discussion. We also intend to attract more researchers to explore and investigate more opportunities of designing and using innovative pre-training models for multimedia tasks.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Pre-training has been an emerging topic that provides a way to learn strong representation in many fields (eg, natural language processing, computing vision). In the last few years, we have witnessed many research works on multi-modal pre-training which have achieved state-of-the-art performances on many multimedia tasks (eg, image-text retrieval, video localization, speech recognition). In this [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"","msr_publisher_other":"","msr_booktitle":"","msr_chapter":"","msr_edition":"","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"","msr_organization":"","msr_pages_string":"","msr_page_range_start":"694","msr_page_range_end":"695","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"ICMR 2021","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":"","msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2021-8-24","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":0,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":0,"footnotes":""},"msr-research-highlight":[],"research-area":[13562,13551],"msr-publication-type":[193716],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[],"msr-conference":[],"msr-journal":[],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-820570","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-computer-vision","msr-research-area-graphics-and-multimedia","msr-locale-en_us"],"msr_publishername":"","msr_edition":"","msr_affiliation":"","msr_published_date":"2021-8-24","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"url","viewUrl":"false","id":"false","title":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3460426.3470947","label_id":"243106","label":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[],"msr-author-ordering":[{"type":"user_nicename","value":"Bei Liu","user_id":38889,"rest_url":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Bei Liu"},{"type":"user_nicename","value":"Jianlong Fu","user_id":32260,"rest_url":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Jianlong Fu"},{"type":"text","value":"Shizhe Chen","user_id":0,"rest_url":false},{"type":"text","value":"Qin Jin","user_id":0,"rest_url":false},{"type":"text","value":"Alexander Hauptmann","user_id":0,"rest_url":false},{"type":"text","value":"Yong Rui","user_id":0,"rest_url":false}],"msr_impact_theme":[],"msr_research_lab":[199560],"msr_event":[],"msr_group":[144916],"msr_project":[212087],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"inproceedings","related_content":{"projects":[{"ID":212087,"post_title":"Vision and Language","post_name":"video-and-language","post_type":"msr-project","post_date":"2016-01-14 20:03:50","post_modified":"2021-05-13 02:20:33","post_status":"publish","permalink":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/project\/video-and-language\/","post_excerpt":"Automatically describing visual content with natural language is a fundamental challenge of computer vision and multimedia. Sequence learning (e.g., Recurrent Neural Networks), attention mechanism, memory networks,\u00a0etc.,\u00a0have attracted increasing attention on visual interpretation. In this project, we are focusing on the following topics related to the emerging topic of \"vision and language\": Image and video captioning, including MSR-VTT video to language grand challenge and datasets (http:\/\/ms-multimedia-challenge.com\/). Image and video commenting (conversation) Visual storytelling (e.g., generation of&hellip;","_links":{"self":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/212087"}]}}]},"_links":{"self":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/820570","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":1,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/820570\/revisions"}],"predecessor-version":[{"id":820585,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/820570\/revisions\/820585"}],"wp:attachment":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/media?parent=820570"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=820570"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=820570"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=820570"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=820570"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=820570"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=820570"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=820570"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=820570"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=820570"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=820570"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=820570"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=820570"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}