Improved recognition of MIME types with file(1).
This commit is contained in:
parent
9f8c941e2a
commit
fdf52fbbcc
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
* Improved listing of available archive formats.
|
* Improved listing of available archive formats.
|
||||||
* Added support for Windows systems.
|
* Added support for Windows systems.
|
||||||
|
* Improved recognition of MIME types.
|
||||||
|
|
||||||
0.3 "Management" (released 23.2.2010)
|
0.3 "Management" (released 23.2.2010)
|
||||||
|
|
||||||
|
|
|
@ -156,10 +156,6 @@ def get_archive_format (filename):
|
||||||
raise util.PatoolError("unknown archive format for file `%s'" % filename)
|
raise util.PatoolError("unknown archive format for file `%s'" % filename)
|
||||||
if mime in ArchiveMimetypes:
|
if mime in ArchiveMimetypes:
|
||||||
format = ArchiveMimetypes[mime]
|
format = ArchiveMimetypes[mime]
|
||||||
elif encoding in ArchiveEncodings:
|
|
||||||
# Files like 't.txt.gz' are recognized with encoding as format, and
|
|
||||||
# an unsupported mime-type like 'text/plain'. Fix this.
|
|
||||||
format, encoding = encoding, None
|
|
||||||
else:
|
else:
|
||||||
raise util.PatoolError("unknown archive format for file `%s' (mime-type is `%s')" % (filename, mime))
|
raise util.PatoolError("unknown archive format for file `%s' (mime-type is `%s')" % (filename, mime))
|
||||||
if format == encoding:
|
if format == encoding:
|
||||||
|
|
121
patoolib/util.py
121
patoolib/util.py
|
@ -83,21 +83,122 @@ def run (cmd, **kwargs):
|
||||||
|
|
||||||
@memoized
|
@memoized
|
||||||
def guess_mime (filename):
|
def guess_mime (filename):
|
||||||
"""Guess the MIME type of given filename. Uses first mimetypes
|
"""Guess the MIME type of given filename using three methods:
|
||||||
and then file(1) as fallback."""
|
(a) using file(1) --mime
|
||||||
mime, encoding = mimedb.guess_type(filename, strict=False)
|
(b) using file(1) and looking at the result string
|
||||||
if mime is None and os.path.isfile(filename):
|
(c) looking at the filename extension with the Python mimetypes module
|
||||||
|
|
||||||
|
Of course only (c) will be eventually successful if the system does not
|
||||||
|
have the file(1) program installed or the given file is not readable.
|
||||||
|
The encoding is determined by method (c).
|
||||||
|
|
||||||
|
The result of this function is cached.
|
||||||
|
"""
|
||||||
|
mime, encoding = None, None
|
||||||
|
if os.path.isfile(filename):
|
||||||
file_prog = find_program("file")
|
file_prog = find_program("file")
|
||||||
if file_prog:
|
if file_prog:
|
||||||
cmd = [file_prog, "--brief", "--mime-type", filename]
|
mime, encoding = guess_mime_file_mime(file_prog, filename)
|
||||||
try:
|
if mime is None:
|
||||||
mime = backtick(cmd).strip()
|
mime = guess_mime_file(file_prog, filename)
|
||||||
except OSError, msg:
|
if mime is None:
|
||||||
# ignore errors, as file(1) is only a fallback
|
mime, encoding = guess_mime_mimedb(filename)
|
||||||
pass
|
assert mime is not None or encoding is None
|
||||||
return mime, encoding
|
return mime, encoding
|
||||||
|
|
||||||
|
|
||||||
|
# Maps an encoding name (as returned by mimedb.guess_type) to the MIME
# type of the corresponding compression format.
Encoding2Mime = {
    'gzip': "application/x-gzip",
    'bzip2': "application/x-bzip2",
    'compress': "application/x-compress",
    'lzma': "application/x-lzma",
    'xz': "application/x-xz",
}
# Reverse lookup: compression MIME type -> encoding name. A generator
# expression is used so that no throwaway list of pairs is built first.
Mime2Encoding = dict((value, key) for key, value in Encoding2Mime.items())
|
||||||
|
|
||||||
|
|
||||||
|
def guess_mime_mimedb (filename):
    """Guess MIME type and encoding from the name of the given file.

    The (mime, encoding) pair comes from mimedb.guess_type(). If the
    guessed MIME type is not a known archive type but the encoding is a
    known archive encoding, the MIME type of that encoding is returned
    instead and the encoding is reset to None.
    """
    guessed_type, guessed_encoding = mimedb.guess_type(filename, strict=False)
    # Function-level import; presumably avoids a circular import between
    # patoolib and this module -- TODO confirm.
    from patoolib import ArchiveMimetypes, ArchiveEncodings
    if guessed_type in ArchiveMimetypes or guessed_encoding not in ArchiveEncodings:
        return guessed_type, guessed_encoding
    # Files like 't.txt.gz' are recognized with encoding as format, and
    # an unsupported mime-type like 'text/plain'. Report the compression
    # format itself as the MIME type in that case.
    return Encoding2Mime[guessed_encoding], None
|
||||||
|
|
||||||
|
|
||||||
|
def guess_mime_file_mime (file_prog, filename):
    """Determine MIME type of filename with file(1) and --mime option.

    First runs file(1) with --brief --mime-type. If the reported type is
    one of the compression MIME types listed in Encoding2Mime, file(1) is
    run a second time with --mime --uncompress to look inside the
    compressed data. When the inner type is a known archive MIME type it
    replaces the outer one, and the encoding is read from the
    compressed-encoding field of that output.

    Returns a tuple (mime, encoding). The result is (None, None) when the
    first file(1) call raises OSError or when the detected MIME type is
    not in ArchiveMimetypes. When only the second call raises OSError, the
    MIME type from the first call is returned with encoding None.
    """
    mime, encoding = None, None
    cmd = [file_prog, "--brief", "--mime-type", filename]
    try:
        mime = backtick(cmd).strip()
    except OSError:
        # ignore errors, as file(1) is only a fallback
        return mime, encoding
    # Function-level import; presumably avoids a circular import between
    # patoolib and this module -- TODO confirm.
    from patoolib import ArchiveMimetypes
    if mime in Encoding2Mime.values():
        # try to look inside compressed archives
        cmd = [file_prog, "--brief", "--mime", "--uncompress", filename]
        try:
            outparts = backtick(cmd).strip().split(";")
        except OSError:
            # ignore errors, as file(1) is only a fallback
            return mime, encoding
        # the first ';'-separated field is the MIME type of the inner data
        mime2 = outparts[0]
        if mime2 in ArchiveMimetypes:
            mime = mime2
            encoding = get_file_mime_encoding(outparts)
    if mime not in ArchiveMimetypes:
        # only report MIME types that are known archive formats
        mime, encoding = None, None
    return mime, encoding
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_mime_encoding (parts):
    """Get the encoding name from the split output of
    file --mime --uncompress.

    Scans every space-separated token of every entry in parts for one that
    starts with "compressed-encoding=". The MIME type after the "=" of the
    first such token is looked up in Mime2Encoding. Returns the encoding
    name, or None if there is no such token or the MIME type is unknown.
    """
    marker = "compressed-encoding="
    for chunk in parts:
        for token in chunk.split(" "):
            if not token.startswith(marker):
                continue
            compressed_mime = token.split("=")[1].strip()
            return Mime2Encoding.get(compressed_mime)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Match file(1) output text to mime types.
# Each key is a prefix of the description text; guess_mime_file() compares
# the keys against the start of the `file --brief` output and returns the
# MIME type stored under a matching key.
FileText2Mime = {
    "7-zip archive data": "application/x-7z-compressed",
    "ARJ archive data": "application/x-arj",
    "bzip2 compressed data": "application/x-bzip2",
    "cpio archive": "application/x-cpio",
    "Debian binary package": "application/x-debian-package",
    "gzip compressed data": "application/x-gzip",
    "lzop compressed data": "application/x-lzop",
    "Microsoft Cabinet archive data": "application/vnd.ms-cab-compressed",
    "RAR archive data": "application/x-rar",
    # NOTE(review): the trailing space is part of the prefix, presumably so
    # that only the standalone word "RPM" matches -- confirm.
    "RPM ": "application/x-redhat-package-manager",
    "POSIX tar archive": "application/x-tar",
    "xz compressed data": "application/x-xz",
    "Zip archive data": "application/zip",
    "compress'd data": "application/x-compress",
}
|
||||||
|
|
||||||
|
def guess_mime_file (file_prog, filename):
    """Determine MIME type of filename with file(1).

    Runs file(1) with --brief and compares the start of its textual output
    against the prefixes in FileText2Mime.

    Returns the MIME type stored under a matching prefix, or None when the
    file(1) call raises OSError or no prefix matches.
    """
    cmd = [file_prog, "--brief", filename]
    try:
        output = backtick(cmd).strip()
    except OSError:
        # ignore errors, as file(1) is only a fallback
        return None
    # match output against known strings
    for matcher, mime in FileText2Mime.items():
        if output.startswith(matcher):
            return mime
    return None
|
||||||
|
|
||||||
|
|
||||||
def check_filename (filename):
|
def check_filename (filename):
|
||||||
"""Ensure that given filename is a valid, existing file."""
|
"""Ensure that given filename is a valid, existing file."""
|
||||||
if not os.path.isfile(filename):
|
if not os.path.isfile(filename):
|
||||||
|
|
Loading…
Reference in New Issue