Module pyfavicon
Expand source code
# Package metadata.
__title__ = 'pyfavicon'
__version__ = '0.0.1'
__author__ = 'Bilal Elmoussaoui'
__license__ = 'MIT'
import os
import bs4
import binascii
import aiohttp
import yarl
import pathlib
from enum import Enum
import urllib
from PIL import ImageFile
import re
# Public names of the module.
__all__ = ['Favicon', 'FaviconType', 'Icon', 'Icons']
# ``rel`` values of <link> tags that are looked up as favicon declarations.
LINK_RELS = [
    'icon',
    'shortcut icon',
    'apple-touch-icon',
    'apple-touch-icon-precomposed',
    'fluid-icon'
]
# ``name`` values of <meta> tags whose ``content`` is read as an icon source.
META_NAMES = [
    'msapplication-TileImage'
]
# Search specifications used by Favicon._find_favicons_links: 'name' and
# 'attrs' are handed to BeautifulSoup.find_all, and 'attr' names the attribute
# of each matching element that holds the icon source.
TAGS = [
    {'name': 'link', 'attrs': {'rel': LINK_RELS, 'href': True}, 'attr': 'href'},
    {'name': 'meta', 'attrs': {'name': META_NAMES, 'content': True}, 'attr': 'content'}
]
def parse_base64_icon(data: str) -> bytes:
    '''
    Decode the payload of a base64 ``data:`` URI.

    Args:
        data (str) : The whole data URI, for example
            ``data:image/png;base64,iVBORw0...``. May be None or empty.

    Returns:
        bytes, the decoded content, or None when data is None or empty.

    Raises:
        ValueError : If data has no ``:`` or ``,`` separator, or if the part
            before the comma does not end with ``base64``.
        binascii.Error : If the payload is not valid base64.
    '''
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote_to_bytes

    if not data:
        return None
    # Only the first ':' and the first ',' are separators; anything after
    # them belongs to the header or to the payload.
    _, colon, remainder = data.partition(':')
    if not colon:
        raise ValueError('not a data URI: missing ":" separator')
    header, comma, payload = remainder.partition(',')
    if not comma:
        raise ValueError('not a data URI: missing "," before the payload')
    # Raised explicitly instead of asserted: asserts vanish under ``python -O``.
    if not header.endswith('base64'):
        raise ValueError('only base64 encoded data URIs are supported')
    # The payload may be percent-encoded inside an HTML attribute.
    return binascii.a2b_base64(unquote_to_bytes(payload))
class FaviconType(Enum):
    '''Tells where the content of an Icon comes from.'''
    # The icon has to be fetched from its link.
    URL = 0
    # The icon content was embedded in the page and is held in Icon.data.
    DATA = 1
class Icon:
    '''
    The Icon object

    Attributes:
        size (int, int) : The dimensions of the favicon; None until parse()
            has run, (-1, -1) when the image could not be decoded
        extension (str) : The image format in lower case (png, ico...);
            None when the image could not be decoded
        type (FaviconType) : Whether the icon scheme is of type data or not.
        link (yarl.URL) : The favicon URL; None for data icons
        data (bytes) : The favicon image content; None for URL icons
        path (pathlib.Path) : The file save() writes to; None until parse()
            has run
    '''

    def __init__(self, **kwargs):
        # Only the "link" and "data" keywords are read; any other keyword
        # (callers also pass "website_url") is ignored.
        self.link = kwargs.get("link")
        self._size = None
        self._extension = None
        # Decoded content when the icon is of type FaviconType.DATA
        self.data = parse_base64_icon(kwargs.get("data"))
        self._path = None
        self.type = FaviconType.DATA if self.data else FaviconType.URL

    @staticmethod
    async def new(source: str, url: yarl.URL):
        '''
        Create a new Icon from the source tag content.

        Args:
            source (str) : The source tag content, either a link or a
                data URI
            url (yarl.URL) : The website URL, used to complete links that
                lack a scheme or a host

        Returns:
            Icon, already parsed (size, extension and path are set)
        '''
        parsed_url = urllib.parse.urlparse(source)
        if parsed_url.scheme == 'data':
            # Data scheme: the source itself carries the image
            icon = Icon(data=source, website_url=url)
        else:
            if parsed_url.netloc:
                # The link names its own host
                fav_url = yarl.URL.build(host=parsed_url.netloc,
                                         path=parsed_url.path)
            elif parsed_url.path.startswith(':'):
                # Only the scheme name is missing, e.g. "://host/icon.png"
                fav_url = yarl.URL(url.scheme + parsed_url.path)
            else:
                # Path only: drop any leading run of '/', '.' and ':' and
                # anchor what is left at the root of the website host
                match_results = re.match(r'^([\/\.\/\:]+)(.+)$',
                                         parsed_url.path)
                if match_results and len(match_results.groups()) == 2:
                    favicon_path = '/' + match_results.group(2)
                else:
                    favicon_path = '/' + parsed_url.path
                fav_url = yarl.URL.build(host=url.host,
                                         path=favicon_path)
            # Prefer the scheme of the link, fall back to the website one
            if parsed_url.scheme:
                fav_url = fav_url.with_scheme(parsed_url.scheme)
            else:
                fav_url = fav_url.with_scheme(url.scheme)
            icon = Icon(link=fav_url, website_url=url)
        await icon.parse(url)
        return icon

    @property
    def size(self) -> (int, int):
        return self._size

    @property
    def path(self) -> pathlib.Path:
        return self._path

    @property
    def extension(self) -> str:
        return self._extension

    def __str__(self):
        return str(self.link)

    async def save(self):
        '''Save the icon

        Nothing is done when the target file already exists.
        You can retrieve the favicon cached path using the path property.
        '''
        if os.path.exists(self.path):
            return
        if self.type is FaviconType.DATA:
            buffer = self.data
        else:
            async with aiohttp.ClientSession() as session:
                # The response is used as a context manager so that its
                # connection is released as soon as the body has been read.
                async with session.get(self.link,
                                       headers=Favicon.HEADERS) as response:
                    buffer = await response.read()
        with open(self.path, 'wb') as fd:
            fd.write(buffer)

    async def parse(self, website_url: yarl.URL):
        '''Detect the size and the format of the icon, then compute its path.

        URL icons are downloaded by chunks of 1024 bytes and the download
        stops as soon as PIL has recognised the image.

        Args:
            website_url (yarl.URL) : The website url, used to name the file
        '''
        # Created outside of the try block so the name is always bound when
        # the result is inspected below.
        image_parser = ImageFile.Parser()
        try:
            with image_parser:
                if self.type is FaviconType.DATA:
                    image_parser.feed(self.data)
                else:
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                                self.link,
                                headers=Favicon.HEADERS) as response:
                            chunks = response.content.iter_chunked(1024)
                            async for chunk in chunks:
                                image_parser.feed(chunk)
                                if image_parser.image:
                                    break
        except OSError:
            # PIL failed to decode the image
            pass
        # If PIL successfully decoded the image
        if image_parser.image:
            self._size = image_parser.image.size
            self._extension = image_parser.image.format.lower()
        else:
            self._size = (-1, -1)
        self._generate_icon_name(website_url)

    def _generate_icon_name(self, website_url: yarl.URL) -> str:
        '''
        Generate an icon name and store the resulting file location in path.

        The name is the website host (or a random temporary name when there
        is none), followed by the icon size and, for URL icons, by the base
        name of the link.

        Args:
            website_url (yarl.URL): The website url

        Returns:
            str, the icon name
        '''
        from tempfile import NamedTemporaryFile, gettempdir
        if website_url and website_url.host:
            image_name = website_url.host
        else:
            # Only the unique name is wanted; the context manager removes
            # the temporary file right away instead of leaving it open.
            with NamedTemporaryFile() as placeholder:
                image_name = os.path.basename(placeholder.name)
        if self._size:
            image_name += '_{}x{}'.format(*self._size)
        if self.type is not FaviconType.DATA:
            image_name += os.path.basename(self.link.path)
        if Favicon.DOWNLOAD_DIR:
            self._path = Favicon.DOWNLOAD_DIR.joinpath(image_name)
        else:
            self._path = pathlib.Path(gettempdir()).joinpath(image_name)
        return image_name
class Icons:
    '''
    Icons, an ordered collection of Icon objects.

    The collection supports len(), indexing and can be iterated any number
    of times.
    '''

    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not used.
        self._data = []
        # Cursor used by direct next(icons) calls only.
        self._current = 0

    def get_largest(self, extension: str = None) -> 'Icon':
        '''Get the largest icon

        Sizes are compared as (width, height) tuples. An icon whose size is
        not known yet (None) is ranked like an undecodable one, (-1, -1).

        Args:
            extension (str) : The required extension

        Returns:
            Icon, or None when no icon matches
        '''
        candidates = self._data
        if extension:
            candidates = [icon for icon in candidates
                          if icon.extension == extension]
        if not candidates:
            return None
        return max(candidates, key=lambda icon: icon.size or (-1, -1))

    def append(self, icon: 'Icon'):
        self._data.append(icon)

    def __iter__(self):
        # A fresh iterator per loop: returning self with a cursor that is
        # never reset made every loop after the first one empty.
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __next__(self):
        # Kept for callers that call next() on the collection itself.
        if self._current >= len(self._data):
            raise StopIteration
        current = self._data[self._current]
        self._current += 1
        return current

    def __str__(self):
        return str(self._data)

    def __getitem__(self, key):
        return self._data[key]
class Favicon:
    '''
    The favicon manager object

    The settings are stored on the class, because Icon reads
    Favicon.HEADERS and Favicon.DOWNLOAD_DIR directly: creating a new
    Favicon replaces them for every existing instance.

    Args:
        download_dir (pathlib.Path) : The location to save the icons on
        headers (dict) : The headers to send with each request
    '''
    DOWNLOAD_DIR = None
    HEADERS = {}

    def __init__(self, download_dir: pathlib.Path = None,
                 headers=None):
        # A new dict per call: a ``{}`` default would be one object shared
        # by every Favicon created without headers.
        Favicon.HEADERS = {} if headers is None else headers
        Favicon.DOWNLOAD_DIR = download_dir

    async def from_url(self, url: str) -> 'Icons':
        '''Fetch all the favicons from a URL

        Args:
            url (str) : The website url to load the favicons from

        Returns:
            Icons
        '''
        async with aiohttp.ClientSession() as session:
            # The response is used as a context manager so that its
            # connection is released once the page has been read.
            async with session.get(url,
                                   headers=Favicon.HEADERS) as response:
                buffer = await response.read()
                # The URL of the response, not the requested one
                final_url = response.url
        # Undecodable bytes are replaced instead of aborting the lookup:
        # only the tags of the page matter here.
        html_content = buffer.decode("utf-8", errors="replace")
        return await self._find_favicons_links(html_content, final_url)

    async def from_html(self, html_content: str,
                        website_url: str = None) -> 'Icons':
        '''Fetch all the favicons from an HTML content

        Args:
            html_content (str) : The HTML content.
            website_url (str) : The website url, the source of the HTML file

        Returns:
            Icons
        '''
        website_url = yarl.URL(website_url) if website_url else None
        return await self._find_favicons_links(html_content, website_url)

    async def from_file(self, html_file: pathlib.Path,
                        website_url: str = None) -> 'Icons':
        '''Fetch all the favicons from an HTML file.

        Args:
            html_file (pathlib.Path) : The HTML file path.
            website_url (str) : The website url, the source of the HTML file

        Returns:
            Icons
        '''
        with html_file.open() as f:
            html_content = f.read()
        return await self.from_html(html_content, website_url)

    async def _find_favicons_links(self, html_content: str,
                                   url: 'yarl.URL' = None) -> 'Icons':
        '''Find the favicon links in an HTML content.

        Args:
            html_content (str) : The HTML content.
            url (yarl.URL) : The website url, the source of the HTML content

        Returns:
            Icons, without duplicates
        '''
        bsoup = bs4.BeautifulSoup(html_content, features="html.parser")
        icons = Icons()
        seen = set()
        for tag in TAGS:
            for elem in bsoup.find_all(tag['name'], attrs=tag['attrs']):
                icon = await Icon.new(elem.attrs[tag['attr']], url=url)
                # Data icons have no link: they are told apart by their
                # content, otherwise only the first one would be kept.
                key = icon.link if icon.link is not None else icon.data
                if key not in seen:
                    seen.add(key)
                    icons.append(icon)
        return icons
Classes
class Favicon (download_dir=None, headers={})
-
The favicon manager object
Args
download_dir (pathlib.Path) : The location to save the icons on
headers (dict) : The headers to send with each request
Expand source code
class Favicon:
    '''
    The favicon manager object

    The settings are stored on the class, because Icon reads
    Favicon.HEADERS and Favicon.DOWNLOAD_DIR directly: creating a new
    Favicon replaces them for every existing instance.

    Args:
        download_dir (pathlib.Path) : The location to save the icons on
        headers (dict) : The headers to send with each request
    '''
    DOWNLOAD_DIR = None
    HEADERS = {}

    def __init__(self, download_dir: pathlib.Path = None,
                 headers=None):
        # A new dict per call: a ``{}`` default would be one object shared
        # by every Favicon created without headers.
        Favicon.HEADERS = {} if headers is None else headers
        Favicon.DOWNLOAD_DIR = download_dir

    async def from_url(self, url: str) -> 'Icons':
        '''Fetch all the favicons from a URL

        Args:
            url (str) : The website url to load the favicons from

        Returns:
            Icons
        '''
        async with aiohttp.ClientSession() as session:
            # The response is used as a context manager so that its
            # connection is released once the page has been read.
            async with session.get(url,
                                   headers=Favicon.HEADERS) as response:
                buffer = await response.read()
                # The URL of the response, not the requested one
                final_url = response.url
        # Undecodable bytes are replaced instead of aborting the lookup:
        # only the tags of the page matter here.
        html_content = buffer.decode("utf-8", errors="replace")
        return await self._find_favicons_links(html_content, final_url)

    async def from_html(self, html_content: str,
                        website_url: str = None) -> 'Icons':
        '''Fetch all the favicons from an HTML content

        Args:
            html_content (str) : The HTML content.
            website_url (str) : The website url, the source of the HTML file

        Returns:
            Icons
        '''
        website_url = yarl.URL(website_url) if website_url else None
        return await self._find_favicons_links(html_content, website_url)

    async def from_file(self, html_file: pathlib.Path,
                        website_url: str = None) -> 'Icons':
        '''Fetch all the favicons from an HTML file.

        Args:
            html_file (pathlib.Path) : The HTML file path.
            website_url (str) : The website url, the source of the HTML file

        Returns:
            Icons
        '''
        with html_file.open() as f:
            html_content = f.read()
        return await self.from_html(html_content, website_url)

    async def _find_favicons_links(self, html_content: str,
                                   url: 'yarl.URL' = None) -> 'Icons':
        '''Find the favicon links in an HTML content.

        Args:
            html_content (str) : The HTML content.
            url (yarl.URL) : The website url, the source of the HTML content

        Returns:
            Icons, without duplicates
        '''
        bsoup = bs4.BeautifulSoup(html_content, features="html.parser")
        icons = Icons()
        seen = set()
        for tag in TAGS:
            for elem in bsoup.find_all(tag['name'], attrs=tag['attrs']):
                icon = await Icon.new(elem.attrs[tag['attr']], url=url)
                # Data icons have no link: they are told apart by their
                # content, otherwise only the first one would be kept.
                key = icon.link if icon.link is not None else icon.data
                if key not in seen:
                    seen.add(key)
                    icons.append(icon)
        return icons
Class variables
var DOWNLOAD_DIR
var HEADERS
Methods
async def from_file(self, html_file, website_url=None)
-
Fetch all the favicons from an HTML file.
Args
html_file (pathlib.Path) : The HTML file path.
website_url (str) : The website url, the source of the HTML file
Returns
Icons
Expand source code
async def from_file(self, html_file: pathlib.Path, website_url: str = None) -> Icons:
    '''Look for the favicons declared in an HTML file.

    Args:
        html_file (pathlib.Path) : The HTML file path.
        website_url (str) : The website url the file was taken from

    Returns:
        Icons
    '''
    with html_file.open() as handle:
        markup = handle.read()
    # The lookup itself is delegated to from_html.
    return await self.from_html(markup, website_url)
async def from_html(self, html_content, website_url=None)
-
Fetch all the favicons from HTML content
Args
html_content (str) : The HTML content.
website_url (str) : The website url, the source of the HTML file
Returns
Icons
Expand source code
async def from_html(self, html_content: str, website_url: str = None) -> Icons:
    '''Look for the favicons declared in HTML content.

    Args:
        html_content (str) : The HTML content.
        website_url (str) : The website url the content was taken from

    Returns:
        Icons
    '''
    # An empty or missing website url is passed on as None.
    base_url = None
    if website_url:
        base_url = yarl.URL(website_url)
    return await self._find_favicons_links(html_content, base_url)
async def from_url(self, url)
-
Fetch all the favicons from a URL
Args
url (str) : The website url to load the favicons from
Returns
Icons
Expand source code
async def from_url(self, url: str) -> Icons:
    '''Fetch all the favicons from a URL

    Args:
        url (str) : The website url to load the favicons from

    Returns:
        Icons
    '''
    async with aiohttp.ClientSession() as session:
        # The response is used as a context manager so that its connection
        # is released once the page has been read.
        async with session.get(url, headers=Favicon.HEADERS) as response:
            buffer = await response.read()
            # The URL of the response, not the requested one
            final_url = response.url
    # Undecodable bytes are replaced instead of aborting the lookup: only
    # the tags of the page matter here.
    html_content = buffer.decode("utf-8", errors="replace")
    return await self._find_favicons_links(html_content, final_url)
class FaviconType (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Expand source code
class FaviconType(Enum):
    '''Tells where the content of an Icon comes from.'''
    # The icon has to be fetched from its link.
    URL = 0
    # The icon content was embedded in the page and is held in Icon.data.
    DATA = 1
Ancestors
- enum.Enum
Class variables
var DATA
var URL
class Icon (**kwargs)
-
The Icon object
Attributes
size (int, int) : The dimensions of the favicon
extension (str) : The icon image format in lower case, without a leading dot (png, ico…)
type (FaviconType) : Whether the icon scheme is of type data or not.
link (yarl.URL) : The favicon URL
data (bytes) : The favicon image content
Expand source code
class Icon:
    '''
    The Icon object

    Attributes:
        size (int, int) : The dimensions of the favicon; None until parse()
            has run, (-1, -1) when the image could not be decoded
        extension (str) : The image format in lower case (png, ico...);
            None when the image could not be decoded
        type (FaviconType) : Whether the icon scheme is of type data or not.
        link (yarl.URL) : The favicon URL; None for data icons
        data (bytes) : The favicon image content; None for URL icons
        path (pathlib.Path) : The file save() writes to; None until parse()
            has run
    '''

    def __init__(self, **kwargs):
        # Only the "link" and "data" keywords are read; any other keyword
        # (callers also pass "website_url") is ignored.
        self.link = kwargs.get("link")
        self._size = None
        self._extension = None
        # Decoded content when the icon is of type FaviconType.DATA
        self.data = parse_base64_icon(kwargs.get("data"))
        self._path = None
        self.type = FaviconType.DATA if self.data else FaviconType.URL

    @staticmethod
    async def new(source: str, url: yarl.URL):
        '''
        Create a new Icon from the source tag content.

        Args:
            source (str) : The source tag content, either a link or a
                data URI
            url (yarl.URL) : The website URL, used to complete links that
                lack a scheme or a host

        Returns:
            Icon, already parsed (size, extension and path are set)
        '''
        parsed_url = urllib.parse.urlparse(source)
        if parsed_url.scheme == 'data':
            # Data scheme: the source itself carries the image
            icon = Icon(data=source, website_url=url)
        else:
            if parsed_url.netloc:
                # The link names its own host
                fav_url = yarl.URL.build(host=parsed_url.netloc,
                                         path=parsed_url.path)
            elif parsed_url.path.startswith(':'):
                # Only the scheme name is missing, e.g. "://host/icon.png"
                fav_url = yarl.URL(url.scheme + parsed_url.path)
            else:
                # Path only: drop any leading run of '/', '.' and ':' and
                # anchor what is left at the root of the website host
                match_results = re.match(r'^([\/\.\/\:]+)(.+)$',
                                         parsed_url.path)
                if match_results and len(match_results.groups()) == 2:
                    favicon_path = '/' + match_results.group(2)
                else:
                    favicon_path = '/' + parsed_url.path
                fav_url = yarl.URL.build(host=url.host,
                                         path=favicon_path)
            # Prefer the scheme of the link, fall back to the website one
            if parsed_url.scheme:
                fav_url = fav_url.with_scheme(parsed_url.scheme)
            else:
                fav_url = fav_url.with_scheme(url.scheme)
            icon = Icon(link=fav_url, website_url=url)
        await icon.parse(url)
        return icon

    @property
    def size(self) -> (int, int):
        return self._size

    @property
    def path(self) -> pathlib.Path:
        return self._path

    @property
    def extension(self) -> str:
        return self._extension

    def __str__(self):
        return str(self.link)

    async def save(self):
        '''Save the icon

        Nothing is done when the target file already exists.
        You can retrieve the favicon cached path using the path property.
        '''
        if os.path.exists(self.path):
            return
        if self.type is FaviconType.DATA:
            buffer = self.data
        else:
            async with aiohttp.ClientSession() as session:
                # The response is used as a context manager so that its
                # connection is released as soon as the body has been read.
                async with session.get(self.link,
                                       headers=Favicon.HEADERS) as response:
                    buffer = await response.read()
        with open(self.path, 'wb') as fd:
            fd.write(buffer)

    async def parse(self, website_url: yarl.URL):
        '''Detect the size and the format of the icon, then compute its path.

        URL icons are downloaded by chunks of 1024 bytes and the download
        stops as soon as PIL has recognised the image.

        Args:
            website_url (yarl.URL) : The website url, used to name the file
        '''
        # Created outside of the try block so the name is always bound when
        # the result is inspected below.
        image_parser = ImageFile.Parser()
        try:
            with image_parser:
                if self.type is FaviconType.DATA:
                    image_parser.feed(self.data)
                else:
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                                self.link,
                                headers=Favicon.HEADERS) as response:
                            chunks = response.content.iter_chunked(1024)
                            async for chunk in chunks:
                                image_parser.feed(chunk)
                                if image_parser.image:
                                    break
        except OSError:
            # PIL failed to decode the image
            pass
        # If PIL successfully decoded the image
        if image_parser.image:
            self._size = image_parser.image.size
            self._extension = image_parser.image.format.lower()
        else:
            self._size = (-1, -1)
        self._generate_icon_name(website_url)

    def _generate_icon_name(self, website_url: yarl.URL) -> str:
        '''
        Generate an icon name and store the resulting file location in path.

        The name is the website host (or a random temporary name when there
        is none), followed by the icon size and, for URL icons, by the base
        name of the link.

        Args:
            website_url (yarl.URL): The website url

        Returns:
            str, the icon name
        '''
        from tempfile import NamedTemporaryFile, gettempdir
        if website_url and website_url.host:
            image_name = website_url.host
        else:
            # Only the unique name is wanted; the context manager removes
            # the temporary file right away instead of leaving it open.
            with NamedTemporaryFile() as placeholder:
                image_name = os.path.basename(placeholder.name)
        if self._size:
            image_name += '_{}x{}'.format(*self._size)
        if self.type is not FaviconType.DATA:
            image_name += os.path.basename(self.link.path)
        if Favicon.DOWNLOAD_DIR:
            self._path = Favicon.DOWNLOAD_DIR.joinpath(image_name)
        else:
            self._path = pathlib.Path(gettempdir()).joinpath(image_name)
        return image_name
Static methods
async def new(source, url)
-
Create a new Icon from the source tag content.
Args
source (str) : The source tag content;
url (yarl.URL) : The website URL
Returns
Icon
Expand source code
@staticmethod
async def new(source: str, url: yarl.URL):
    '''
    Build an Icon out of the content of a source tag and parse it.

    Args:
        source (str) : The source tag content, either a link or a data URI
        url (yarl.URL) : The website URL, used to complete links that lack
            a scheme or a host

    Returns:
        Icon
    '''
    pieces = urllib.parse.urlparse(source)
    if pieces.scheme == 'data':
        # Data scheme: the source itself carries the image
        icon = Icon(data=source, website_url=url)
    else:
        if pieces.netloc:
            # The link names its own host
            target = yarl.URL.build(host=pieces.netloc, path=pieces.path)
        elif pieces.path.startswith(':'):
            # Only the scheme name is missing, e.g. "://host/icon.png"
            target = yarl.URL(url.scheme + pieces.path)
        else:
            # Path only: drop any leading run of '/', '.' and ':' and anchor
            # what is left at the root of the website host
            found = re.match(r'^([\/\.\/\:]+)(.+)$', pieces.path)
            if found and len(found.groups()) == 2:
                rooted_path = '/' + found.group(2)
            else:
                rooted_path = '/' + pieces.path
            target = yarl.URL.build(host=url.host, path=rooted_path)
        # Prefer the scheme of the link, fall back to the website one
        scheme = pieces.scheme if pieces.scheme else url.scheme
        target = target.with_scheme(scheme)
        icon = Icon(link=target, website_url=url)
    await icon.parse(url)
    return icon
Instance variables
var extension
-
Expand source code
@property
def extension(self) -> str:
    '''Read-only view of the stored ``_extension`` value.'''
    return self._extension
var path
-
Expand source code
@property
def path(self) -> pathlib.Path:
    '''Read-only view of the stored ``_path`` value.'''
    return self._path
var size
-
Expand source code
@property
def size(self) -> (int, int):
    '''Read-only view of the stored ``_size`` value.'''
    return self._size
Methods
async def parse(self, website_url)
-
Expand source code
async def parse(self, website_url: yarl.URL):
    '''Detect the size and the format of the icon, then compute its path.

    URL icons are downloaded by chunks of 1024 bytes and the download stops
    as soon as PIL has recognised the image. The size is set to (-1, -1)
    when the image could not be decoded.

    Args:
        website_url (yarl.URL) : The website url, used to name the file
    '''
    # Created outside of the try block so the name is always bound when the
    # result is inspected below.
    image_parser = ImageFile.Parser()
    try:
        with image_parser:
            if self.type is FaviconType.DATA:
                image_parser.feed(self.data)
            else:
                async with aiohttp.ClientSession() as session:
                    # The response is used as a context manager so that its
                    # connection is released, even when the download is cut
                    # short by the break below.
                    async with session.get(
                            self.link,
                            headers=Favicon.HEADERS) as response:
                        chunks = response.content.iter_chunked(1024)
                        async for chunk in chunks:
                            image_parser.feed(chunk)
                            if image_parser.image:
                                break
    except OSError:
        # PIL failed to decode the image
        pass
    # If PIL successfully decoded the image
    if image_parser.image:
        self._size = image_parser.image.size
        self._extension = image_parser.image.format.lower()
    else:
        self._size = (-1, -1)
    self._generate_icon_name(website_url)
async def save(self)
-
Save the icon
You can retrieve the favicon cached path using the path property.
Expand source code
async def save(self):
    '''Save the icon

    Nothing is done when the target file already exists.
    You can retrieve the favicon cached path using the path property.
    '''
    if os.path.exists(self.path):
        return
    if self.type is FaviconType.DATA:
        buffer = self.data
    else:
        async with aiohttp.ClientSession() as session:
            # The response is used as a context manager so that its
            # connection is released as soon as the body has been read.
            async with session.get(self.link,
                                   headers=Favicon.HEADERS) as response:
                buffer = await response.read()
    with open(self.path, 'wb') as fd:
        fd.write(buffer)
class Icons (**kwargs)
-
Icons, a collection of Icon objects.
Expand source code
class Icons:
    '''
    Icons, an ordered collection of Icon objects.

    The collection supports len(), indexing and can be iterated any number
    of times.
    '''

    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not used.
        self._data = []
        # Cursor used by direct next(icons) calls only.
        self._current = 0

    def get_largest(self, extension: str = None) -> 'Icon':
        '''Get the largest icon

        Sizes are compared as (width, height) tuples. An icon whose size is
        not known yet (None) is ranked like an undecodable one, (-1, -1).

        Args:
            extension (str) : The required extension

        Returns:
            Icon, or None when no icon matches
        '''
        candidates = self._data
        if extension:
            candidates = [icon for icon in candidates
                          if icon.extension == extension]
        if not candidates:
            return None
        return max(candidates, key=lambda icon: icon.size or (-1, -1))

    def append(self, icon: 'Icon'):
        self._data.append(icon)

    def __iter__(self):
        # A fresh iterator per loop: returning self with a cursor that is
        # never reset made every loop after the first one empty.
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __next__(self):
        # Kept for callers that call next() on the collection itself.
        if self._current >= len(self._data):
            raise StopIteration
        current = self._data[self._current]
        self._current += 1
        return current

    def __str__(self):
        return str(self._data)

    def __getitem__(self, key):
        return self._data[key]
Methods
def append(self, icon)
-
Expand source code
def append(self, icon: Icon):
    '''Add icon at the end of the collection.'''
    self._data.append(icon)
def get_largest(self, extension=None)
-
Get the largest icon
Args
extension (str) : The required extension
Returns
Icon
Expand source code
def get_largest(self, extension: str = None) -> 'Icon':
    '''Get the largest icon

    Sizes are compared as (width, height) tuples. An icon whose size is not
    known yet (None) is ranked like an undecodable one, (-1, -1), instead of
    making the comparison fail.

    Args:
        extension (str) : The required extension

    Returns:
        Icon, or None when no icon matches
    '''
    candidates = self._data
    if extension:
        candidates = [icon for icon in candidates
                      if icon.extension == extension]
    if not candidates:
        return None
    return max(candidates, key=lambda icon: icon.size or (-1, -1))