oembed: Remove unsound HTML filtering.

The frontend now takes care of confining the HTML.

Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
This commit is contained in:
Anders Kaseorg 2019-12-12 00:39:41 -08:00 committed by Tim Abbott
parent 8459185970
commit faa3ea0b8e
2 changed files with 11 additions and 27 deletions

View File

@ -1,4 +1,3 @@
from bs4 import BeautifulSoup, SoupStrainer
from typing import Optional, Dict, Any
from pyoembed import oEmbed, PyOembedException
@ -20,25 +19,16 @@ def get_oembed_data(url: str,
data['oembed'] = True
elif oembed_resource_type == 'video' and html and thumbnail:
data['html'] = get_safe_html(html)
data['html'] = strip_cdata(html)
data['image'] = thumbnail
# Add a key to identify oembed metadata as opposed to other metadata
data['oembed'] = True
return data
def get_safe_html(html: str) -> str:
"""Return a safe version of the oEmbed html.
Verify that the HTML:
1. has a single iframe
2. the src uses a schema relative URL or explicitly specifies http(s)
"""
def strip_cdata(html: str) -> str:
# Work around a bug in SoundCloud's XML generation:
# <html>&lt;![CDATA[&lt;iframe ...&gt;&lt;/iframe&gt;]]&gt;</html>
if html.startswith('<![CDATA[') and html.endswith(']]>'):
html = html[9:-3]
soup = BeautifulSoup(html, 'lxml', parse_only=SoupStrainer('iframe'))
iframe = soup.find('iframe')
if iframe is not None and iframe.get('src').startswith(('http://', 'https://', '//')):
return str(soup)
return ''
return html

View File

@ -14,7 +14,7 @@ from zerver.lib.test_helpers import MockPythonResponse
from zerver.worker.queue_processors import FetchLinksEmbedData
from zerver.lib.url_preview.preview import (
get_link_embed_data, link_embed_data_from_cache)
from zerver.lib.url_preview.oembed import get_oembed_data, get_safe_html
from zerver.lib.url_preview.oembed import get_oembed_data, strip_cdata
from zerver.lib.url_preview.parsers import (
OpenGraphParser, GenericParser)
from zerver.lib.cache import cache_set, NotFoundInCache, preview_url_cache_key
@ -116,22 +116,16 @@ class OembedTestCase(ZulipTestCase):
data = get_oembed_data(url)
self.assertIsNone(data)
def test_safe_oembed_html(self) -> None:
def test_oembed_html(self) -> None:
html = '<iframe src="//www.instagram.com/embed.js"></iframe>'
safe_html = get_safe_html(html)
self.assertEqual(html, safe_html)
def test_unsafe_oembed_html(self) -> None:
html = ('<blockquote class="instagram-media" data-instgrm-captioned>test</blockquote>\n'
'<script async src="//www.instagram.com/embed.js"></script>')
safe_html = get_safe_html(html)
self.assertEqual('', safe_html)
stripped_html = strip_cdata(html)
self.assertEqual(html, stripped_html)
def test_autodiscovered_oembed_xml_format_html(self) -> None:
iframe_content = '<iframe src="https://w.soundcloud.com/player"></iframe>'
html = '<![CDATA[{}]]>'.format(iframe_content)
safe_html = get_safe_html(html)
self.assertEqual(iframe_content, safe_html)
stripped_html = strip_cdata(html)
self.assertEqual(iframe_content, stripped_html)
class OpenGraphParserTestCase(ZulipTestCase):