{"id":1035006,"date":"2024-06-04T11:04:19","date_gmt":"2024-06-04T18:04:19","guid":{"rendered":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/?post_type=msr-blog-post&#038;p=1035006"},"modified":"2024-06-04T11:04:21","modified_gmt":"2024-06-04T18:04:21","slug":"insights-into-the-challenges-and-opportunities-of-large-multi-modal-models-for-blind-and-low-vision-users-a-case-study-on-clip","status":"publish","type":"msr-blog-post","link":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/articles\/insights-into-the-challenges-and-opportunities-of-large-multi-modal-models-for-blind-and-low-vision-users-a-case-study-on-clip\/","title":{"rendered":"Insights into the Challenges and Opportunities of Large Multi-Modal Models for Blind and Low Vision Users: A Case Study on CLIP"},"content":{"rendered":"\n<p class=\"has-purple-color has-text-color has-link-color wp-elements-e12752b8e25cfcca425c88600532d3b3\"><em>Presented by&nbsp;<a href=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/people\/dmassiceti\/\">Daniela Massiceti<\/a><\/em> <em>at&nbsp;<strong>Microsoft Research Forum, June 2024<\/strong><\/em><\/p>\n\n\n\n<div class=\"wp-block-media-text has-vertical-margin-none  has-vertical-padding-none  is-stacked-on-mobile has-white-background-color has-background\" style=\"grid-template-columns:25% auto\"><figure class=\"wp-block-media-text__media\"><img loading=\"lazy\" decoding=\"async\" width=\"360\" height=\"360\" src=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/05\/Daniela-Massiceti_360x360.jpg\" alt=\"Daniela Massiceti\" class=\"wp-image-1035159 size-full\" srcset=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/05\/Daniela-Massiceti_360x360.jpg 360w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/05\/Daniela-Massiceti_360x360-300x300.jpg 300w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/05\/Daniela-Massiceti_360x360-150x150.jpg 150w, 
https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/05\/Daniela-Massiceti_360x360-180x180.jpg 180w\" sizes=\"auto, (max-width: 360px) 100vw, 360px\" \/><\/figure><div class=\"wp-block-media-text__content\">\n<blockquote class=\"wp-block-quote is-style-spectrum is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u201cToday&#8217;s AI models hold incredible potential for assisting the blind community\u2014from text recognition to object identification to question answering. Apps like Seeing AI are already deploying some of these AI features. But there is potential for much more.\u201d<\/p>\n<cite><em>\u2013<\/em> Daniela Massiceti, Senior Researcher, Microsoft Research Cambridge<\/cite><\/blockquote>\n<\/div><\/div>\n\n\n\n<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio\"><div class=\"wp-block-embed__wrapper\">\n<div class=\"yt-consent-placeholder\" role=\"region\" aria-label=\"Video playback requires cookie consent\" data-video-id=\"yCkF89SmARE\" data-poster=\"https:\/\/img.youtube.com\/vi\/yCkF89SmARE\/maxresdefault.jpg\"><iframe aria-hidden=\"true\" tabindex=\"-1\" title=\"Challenges and Opportunities of Large Multi-Modal Models for Blind and Low Vision Users: CLIP\" width=\"500\" height=\"281\" data-src=\"https:\/\/www.youtube-nocookie.com\/embed\/yCkF89SmARE?feature=oembed&rel=0&enablejsapi=1\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen><\/iframe><div class=\"yt-consent-placeholder__overlay\"><button class=\"yt-consent-placeholder__play\"><svg width=\"42\" height=\"42\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" aria-hidden=\"true\" focusable=\"false\"><g fill=\"none\" fill-rule=\"evenodd\"><circle fill=\"#000\" opacity=\".556\" cx=\"21\" cy=\"21\" r=\"21\"\/><path stroke=\"#FFF\" d=\"M27.5 22l-12 
8.5v-17z\"\/><\/g><\/svg><span class=\"yt-consent-placeholder__label\">Video playback requires cookie consent<\/span><\/button><\/div><\/div>\n<\/div><\/figure>\n\n\n\n<div class=\"annotations \" data-bi-aN=\"margin-callout\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 annotations__list--right\">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=What%20insights%20did%20Daniela%20Massiceti%20share%20about%20the%20challenges%20and%20opportunities%20of%20multi-modal%20models%20like%20CLIP%20for%20blind%20and%20low%20vision%20users\" target=\"_blank\" aria-label=\"What insights did Daniela Massiceti share about the challenges and opportunities of multi-modal models like CLIP for blind and low vision users?\" data-bi-type=\"annotated-link\" data-bi-cN=\"What insights did Daniela Massiceti share about the challenges and opportunities of multi-modal models like CLIP for blind and low vision users?\" class=\"annotations__list-thumbnail\" >\n\t\t\t\t\t<img loading=\"lazy\" decoding=\"async\" width=\"172\" height=\"96\" src=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png\" class=\"mb-2\" alt=\"Ask Microsoft research copilot experience\" srcset=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png 240w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-300x169.png 300w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1024x576.png 1024w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-768x432.png 768w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1066x600.png 1066w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-655x368.png 655w, 
https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-343x193.png 343w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-640x360.png 640w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-960x540.png 960w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1280x720.png 1280w, https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo.png 1400w\" sizes=\"auto, (max-width: 172px) 100vw, 172px\" \/>\t\t\t\t<\/a>\n\t\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Microsoft research copilot experience<\/span>\n\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=What%20insights%20did%20Daniela%20Massiceti%20share%20about%20the%20challenges%20and%20opportunities%20of%20multi-modal%20models%20like%20CLIP%20for%20blind%20and%20low%20vision%20users\" data-bi-cN=\"What insights did Daniela Massiceti share about the challenges and opportunities of multi-modal models like CLIP for blind and low vision users?\" target=\"_blank\" rel=\"noopener noreferrer\" data-external-link=\"true\" data-bi-aN=\"margin-callout\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>What insights did Daniela Massiceti share about the challenges and opportunities of multi-modal models like CLIP for blind and low vision users?<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-open-in-new-tab\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n\n\n<div class=\"wp-block-msr-show-more\">\n\t<div class=\"bg-neutral-100 p-5\">\n\t\t<div class=\"show-more-show-less\">\n\t\t\t<div>\n\t\t\t\t<span>\n\t\t\t\t\t\n\n<h3 class=\"wp-block-heading\" 
id=\"transcript-panel-discussion-transforming-the-natural-sciences-with-ai\">Transcript: Lightning Talk<\/h3>\n\n\n\n<p><strong>Insights into the Challenges and Opportunities of Large Multi-Modal Models for Blind and Low Vision Users: A Case Study on CLIP<\/strong><\/p>\n\n\n\n<p><strong>Daniela Massiceti<\/strong>, Senior Researcher, Microsoft Research Cambridge<\/p>\n\n\n\n<p>Daniela Massiceti delves into the transformative potential of multimodal models such as CLIP for assistive technologies. Specifically focusing on the blind\/low-vision community, the talk explores the current distance from realizing this potential and the advancements needed to bridge this gap.<\/p>\n\n\n\n<p>Microsoft Research Forum, June 4, 2024<\/p>\n\n\n\n\t\t\t\t<\/span>\n\t\t\t\t<span id=\"show-more-show-less-toggle-1\" class=\"show-more-show-less-toggleable-content\">\n\t\t\t\t\t\n\n\n\n<p><strong>DANIELA MASSICETI:<\/strong> Hi there. My name is Daniela Massiceti, and I&#8217;m a senior researcher at Microsoft Research Cambridge. Today, I will be sharing <a href=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/publication\/explaining-clips-performance-disparities-on-data-from-blind-low-vision-users\/\">our recent CVPR paper<\/a>, which examines the challenges and opportunities of large multi-modal models for blind and low-vision users.<\/p>\n\n\n\n<p>Today&#8217;s AI models hold incredible potential for assisting the blind community\u2014from text recognition to object identification to question answering. Apps like Seeing AI are already deploying some of these AI features. But there is potential for much more. And I think this is hinted at by the recent partnership between OpenAI and Be My Eyes, with the promise that one day, human assistance could be replaced by AI agents that provide instantaneous assistance to blind users around the world. 
But despite their potential, no works have really looked at, well, how well do these models actually work on image and text data captured by blind users? And we know from the literature that this data is likely to be out of distribution or different in a number of ways. For example, blind users use a range of quite specialized assistive objects. They also are more likely to capture images with quality variation, things like camera blur and occlusion. And they&#8217;re also more likely to make use of non-visual vocabulary, for example, describing their objects by their physical rather than their visual properties.<\/p>\n\n\n\n<p>Our work, therefore, set out to remedy this. Specifically, we systematically evaluated 25 variants of the CLIP model on data from blind and low-vision users. CLIP is one of today&#8217;s most widely used multi-modal models. It has over 15,000 citations and 75 million downloads. We used the ORBIT and the VizWiz-Classification datasets. Both of these are collected by blind users through real-world assistive applications. And we inspected CLIP&#8217;s performance on both a zero-shot image classification task directly as well as through examining the performance of models that use CLIP as a component, which is very widely done in the community. I unfortunately don&#8217;t have time to go into all the details of our work, but I will share our top three findings with you. First, we confirmed that CLIP does indeed underperform on data that is captured by blind and low-vision users. Second, these disparities trickle down to models that use CLIP as a component. And then third, these disparities stem from the fact that disability content is significantly underrepresented and sometimes missing completely from the datasets that are used to pretrain these large models. 
And I&#8217;ll dive into our three findings in a bit more detail.<\/p>\n\n\n\n<p>So for the first finding, we found that CLIP underperforms on objects, image quality, and language that is typically used by blind users. On object type, CLIP recognizes disability objects like a Braille keyboard, for example, up to 28 percentage points less accurately than common objects like a TV remote. On image quality, CLIP is up to 23 percentage points more sensitive to images with things like camera blur and lighting compared to images that don&#8217;t have these quality issues. And on language, CLIP recognizes objects that are described by their material\u2014so, for example, a <em>leather<\/em> boot\u2014up to 12 percentage points less accurately than objects described by their color\u2014for example, a <em>brown<\/em> boot. And we know that blind users rely heavily on this tactile rather than visual language.<\/p>\n\n\n\n<p>Towards our second finding, we examined three models that use CLIP under the hood\u2014an object detection model, an image segmentation model, and an image generation model\u2014and found that all three struggle with disability content. For example, DALL-E 2, which relies on a CLIP vision encoder, cannot generate common disability objects like guide canes and Braille keyboards. Instead, as you can see here, it gives us very strange-looking walking canes and lots and lots of randomly placed white dots. In comparison, DALL-E 2 generated really high-quality and realistic images for almost all of the non-disability objects that we tested.<\/p>\n\n\n\n<p>And then towards our third and final finding, we really wanted to understand where these performance disparities were stemming from. And so we quantified just how prevalent disability content is in three popular datasets that are commonly used to pretrain these large models: LAION-[400]Million, LAION-2 Billion, and the DataComp-1B dataset, or 1 billion dataset. 
Specifically, we counted how many times objects are mentioned in these datasets\u2019 captions and found that disability objects appear up to 16 to 17 times less frequently than non-disability objects across all three of the datasets.<\/p>\n\n\n\n<p>So as you can see, our work has identified a clear gap in current models\u2019 capabilities for blind users, and this could have very real consequences if these models are then integrated into assistive technologies for the blind and low-vision community. So what should we, as a research community, be doing about it? First, I think more work is needed to understand how models come to learn or adapt to long-tailed data. Some of our early results show that few-shot learning approaches hold some promise, but they don&#8217;t always work, especially in more challenging scenarios, for example, when objects appear in highly cluttered scenarios. And second, I think it&#8217;s important for us to really focus on including more disability content in these large-scale pretraining datasets. And our team [is] currently working on developing equitable and fair practices alongside disabled communities to source data that is truly representative of their needs. 
And so with that, I will wrap up.<\/p>\n\n\n\n<p>Thank you to all the people behind this work and thank you for listening.<\/p>\n\n\t\t\t\t<\/span>\n\t\t\t<\/div>\n\t\t\t<button\n\t\t\t\tclass=\"action-trigger glyph-prepend mt-2 mb-0 show-more-show-less-toggle\"\n\t\t\t\taria-expanded=\"false\"\n\t\t\t\tdata-show-less-text=\"Show less\"\n\t\t\t\ttype=\"button\"\n\t\t\t\taria-controls=\"show-more-show-less-toggle-1\"\n\t\t\t\taria-label=\"Show more content\"\n\t\t\t\tdata-alternate-aria-label=\"Show less content\">\n\t\t\t\tShow more\t\t\t<\/button>\n\t\t<\/div>\n\t<\/div>\n<\/div>\n\n\n\n<h2 class=\"wp-block-heading alignwide\" id=\"related-resources\">Related resources<\/h2>\n\n\n\n<div class=\"wp-block-columns alignwide are-vertically-aligned-top is-layout-flex wp-container-core-columns-is-layout-9d6595d7 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Research Lab<\/span>\n\t\t\t<a href=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/lab\/microsoft-research-cambridge\/\" data-bi-cN=\"Microsoft Research Lab \u2013 Cambridge\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Microsoft Research Lab \u2013 Cambridge<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card 
depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Publication<\/span>\n\t\t\t<a href=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/publication\/explaining-clips-performance-disparities-on-data-from-blind-low-vision-users\/\" data-bi-cN=\"Explaining CLIP\u2019s performance disparities on data from blind\/low vision users\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Explaining CLIP\u2019s performance disparities on data from blind\/low vision users<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Event<\/span>\n\t\t\t<a href=\"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/event\/cvpr-2024\/\" data-bi-cN=\"Microsoft at CVPR 2024\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Microsoft at CVPR 2024<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>Daniela Massiceti delves into the transformative potential of multimodal models such as CLIP for assistive technologies. 
Specifically focusing on the blind\/low-vision community, the talk explores the current distance from realizing this potential and the advancements needed to bridge this gap.<\/p>\n","protected":false},"author":42735,"featured_media":1040058,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-content-parent":1034949,"msr_hide_image_in_river":0,"footnotes":""},"research-area":[],"msr-locale":[268875],"msr-post-option":[],"class_list":["post-1035006","msr-blog-post","type-msr-blog-post","status-publish","has-post-thumbnail","hentry","msr-locale-en_us"],"msr_assoc_parent":{"id":1034949,"type":"story"},"_links":{"self":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/1035006","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post"}],"about":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-blog-post"}],"author":[{"embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/users\/42735"}],"version-history":[{"count":8,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/1035006\/revisions"}],"predecessor-version":[{"id":1041906,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/1035006\/revisions\/1041906"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/media\/1040058"}],"wp:attachment":[{"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/media?parent=1035006"}],"wp:term":[{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=1035006"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/cm-edgetun.pages
.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=1035006"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/cm-edgetun.pages.dev\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=1035006"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}