forgejo/modules/typesniffer/typesniffer.go
Gusted dbaf3eb13c
[GITEA] Recognize OGG as an audio format
- OGG is officially an container format and is therefor classified as
`application/ogg` in RFC3534. While `audio/ogg` exists, as defined in
RFC5334, it doesn't have a different magic number and thus purely
informative, it can only be determined by parsing the file and checking
if it only contains audio data.
- A quick search on Wikimedia Commons yields that the OGG container is
by far more used for audio than for video, so it's safe to classify this
as audio, as OGG files that only contain video would now falsy be
classified as an audio file (previously it would've shown just a link to
the 'View Raw' link).
- Added unit tests.
- Resolves https://codeberg.org/forgejo/forgejo/issues/1091
2023-07-28 20:48:49 +02:00

133 lines
4 KiB
Go

// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package typesniffer
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"code.gitea.io/gitea/modules/util"
)
// Use at most this many bytes to determine Content Type.
const sniffLen = 1024
const (
// SvgMimeType MIME type of SVG images.
SvgMimeType = "image/svg+xml"
// ApplicationOctetStream MIME type of binary files.
ApplicationOctetStream = "application/octet-stream"
)
var (
svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
)
// SniffedType contains information about a blobs type.
type SniffedType struct {
contentType string
}
// IsText etects if content format is plain text.
func (ct SniffedType) IsText() bool {
return strings.Contains(ct.contentType, "text/")
}
// IsImage detects if data is an image format
func (ct SniffedType) IsImage() bool {
return strings.Contains(ct.contentType, "image/")
}
// IsSvgImage detects if data is an SVG image format
func (ct SniffedType) IsSvgImage() bool {
return strings.Contains(ct.contentType, SvgMimeType)
}
// IsPDF detects if data is a PDF format
func (ct SniffedType) IsPDF() bool {
return strings.Contains(ct.contentType, "application/pdf")
}
// IsVideo detects if data is an video format
func (ct SniffedType) IsVideo() bool {
return strings.Contains(ct.contentType, "video/")
}
// IsAudio detects if data is an video format
func (ct SniffedType) IsAudio() bool {
return strings.Contains(ct.contentType, "audio/") || strings.Contains(ct.contentType, "application/ogg")
}
// IsRepresentableAsText returns true if file content can be represented as
// plain text or is empty.
func (ct SniffedType) IsRepresentableAsText() bool {
return ct.IsText() || ct.IsSvgImage()
}
// IsBrowsableType returns whether a non-text type can be displayed in a browser
func (ct SniffedType) IsBrowsableBinaryType() bool {
return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
}
// GetMimeType returns the mime type
func (ct SniffedType) GetMimeType() string {
return strings.SplitN(ct.contentType, ";", 2)[0]
}
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
func DetectContentType(data []byte) SniffedType {
if len(data) == 0 {
return SniffedType{"text/unknown"}
}
ct := http.DetectContentType(data)
if len(data) > sniffLen {
data = data[:sniffLen]
}
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
detectByXML := strings.Contains(ct, "text/xml")
if detectByHTML || detectByXML {
dataProcessed := svgComment.ReplaceAll(data, nil)
dataProcessed = bytes.TrimSpace(dataProcessed)
if detectByHTML && svgTagRegex.Match(dataProcessed) ||
detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
ct = SvgMimeType
}
}
if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
ct2 := http.DetectContentType(data[3:])
if strings.HasPrefix(ct2, "text/") {
ct = ct2
}
}
return SniffedType{ct}
}
// DetectContentTypeFromReader guesses the content type contained in the reader.
func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
buf := make([]byte, sniffLen)
n, err := util.ReadAtMost(r, buf)
if err != nil {
return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
}
buf = buf[:n]
return DetectContentType(buf), nil
}