diff --git a/zerver/lib/url_preview/oembed.py b/zerver/lib/url_preview/oembed.py index e1b66b4894..3f11f91dff 100644 --- a/zerver/lib/url_preview/oembed.py +++ b/zerver/lib/url_preview/oembed.py @@ -1,4 +1,3 @@ -from bs4 import BeautifulSoup, SoupStrainer from typing import Optional, Dict, Any from pyoembed import oEmbed, PyOembedException @@ -20,25 +19,16 @@ def get_oembed_data(url: str, data['oembed'] = True elif oembed_resource_type == 'video' and html and thumbnail: - data['html'] = get_safe_html(html) + data['html'] = strip_cdata(html) data['image'] = thumbnail # Add a key to identify oembed metadata as opposed to other metadata data['oembed'] = True return data -def get_safe_html(html: str) -> str: - """Return a safe version of the oEmbed html. - - Verify that the HTML: - 1. has a single iframe - 2. the src uses a schema relative URL or explicitly specifies http(s) - - """ +def strip_cdata(html: str) -> str: + # Work around a bug in SoundCloud's XML generation: + # <![CDATA[<iframe ...></iframe>]]> if html.startswith('<![CDATA['): html = html[9:-3] - soup = BeautifulSoup(html, 'lxml', parse_only=SoupStrainer('iframe')) - iframe = soup.find('iframe') - if iframe is not None and iframe.get('src').startswith(('http://', 'https://', '//')): - return str(soup) - return '' + return html diff --git a/zerver/tests/test_link_embed.py b/zerver/tests/test_link_embed.py index 157ea902e0..49fb96d8d2 100644 --- a/zerver/tests/test_link_embed.py +++ b/zerver/tests/test_link_embed.py @@ -14,7 +14,7 @@ from zerver.lib.test_helpers import MockPythonResponse from zerver.worker.queue_processors import FetchLinksEmbedData from zerver.lib.url_preview.preview import ( get_link_embed_data, link_embed_data_from_cache) -from zerver.lib.url_preview.oembed import get_oembed_data, get_safe_html +from zerver.lib.url_preview.oembed import get_oembed_data, strip_cdata from zerver.lib.url_preview.parsers import ( OpenGraphParser, GenericParser) from zerver.lib.cache import cache_set, NotFoundInCache, 
preview_url_cache_key @@ -116,22 +116,16 @@ class OembedTestCase(ZulipTestCase): data = get_oembed_data(url) self.assertIsNone(data) - def test_safe_oembed_html(self) -> None: + def test_oembed_html(self) -> None: html = '' - safe_html = get_safe_html(html) - self.assertEqual(html, safe_html) - - def test_unsafe_oembed_html(self) -> None: - html = ('
test
\n' - '') safe_html = get_safe_html(html) - self.assertEqual('', safe_html) + stripped_html = strip_cdata(html) + self.assertEqual(html, stripped_html) def test_autodiscovered_oembed_xml_format_html(self) -> None: iframe_content = '' html = '<![CDATA[{}]]>'.format(iframe_content) - safe_html = get_safe_html(html) - self.assertEqual(iframe_content, safe_html) + stripped_html = strip_cdata(html) + self.assertEqual(iframe_content, stripped_html) class OpenGraphParserTestCase(ZulipTestCase):