mirror of
https://codeberg.org/forgejo/forgejo
synced 2024-09-17 12:21:18 +02:00
dbaf3eb13c
- OGG is officially a container format and is therefore classified as `application/ogg` in RFC3534. While `audio/ogg` exists, as defined in RFC5334, it doesn't have a different magic number and is thus purely informative; it can only be determined by parsing the file and checking if it only contains audio data. - A quick search on Wikimedia Commons yields that the OGG container is by far more used for audio than for video, so it's safe to classify this as audio, even though OGG files that only contain video would now falsely be classified as an audio file (previously it would've shown just the 'View Raw' link). - Added unit tests. - Resolves https://codeberg.org/forgejo/forgejo/issues/1091
133 lines
4 KiB
Go
133 lines
4 KiB
Go
// Copyright 2021 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package typesniffer
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"code.gitea.io/gitea/modules/util"
|
|
)
|
|
|
|
// sniffLen is the maximum number of leading bytes that are inspected when
// determining a content type.
const sniffLen = 1024
|
|
|
|
// MIME type names referenced by the sniffer.
const (
	// SvgMimeType is the MIME type of SVG images.
	SvgMimeType = "image/svg+xml"

	// ApplicationOctetStream is the MIME type of binary files.
	ApplicationOctetStream = "application/octet-stream"
)
|
|
|
|
// Patterns used to recognise SVG documents. They are compiled once at package
// initialisation.
var (
	// svgComment matches a single <!-- ... --> comment; the match is
	// non-greedy and may span several lines.
	svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)

	// svgTagRegex matches input that starts (after optional whitespace and
	// any number of "<!DOCTYPE svg ...>" declarations) with an "<svg" tag.
	// Matching is case-insensitive.
	svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)

	// svgTagInXMLRegex is the same check for input that begins with an
	// "<?xml ... ?>" declaration placed before the optional DOCTYPE
	// declarations and the "<svg" tag.
	svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
)
|
|
|
|
// SniffedType contains information about a blob's type. It wraps the content
// type string that was detected for the blob.
type SniffedType struct {
	contentType string
}
|
|
|
|
// IsText etects if content format is plain text.
|
|
func (ct SniffedType) IsText() bool {
|
|
return strings.Contains(ct.contentType, "text/")
|
|
}
|
|
|
|
// IsImage detects if data is an image format
|
|
func (ct SniffedType) IsImage() bool {
|
|
return strings.Contains(ct.contentType, "image/")
|
|
}
|
|
|
|
// IsSvgImage detects if data is an SVG image format
|
|
func (ct SniffedType) IsSvgImage() bool {
|
|
return strings.Contains(ct.contentType, SvgMimeType)
|
|
}
|
|
|
|
// IsPDF detects if data is a PDF format
|
|
func (ct SniffedType) IsPDF() bool {
|
|
return strings.Contains(ct.contentType, "application/pdf")
|
|
}
|
|
|
|
// IsVideo detects if data is an video format
|
|
func (ct SniffedType) IsVideo() bool {
|
|
return strings.Contains(ct.contentType, "video/")
|
|
}
|
|
|
|
// IsAudio detects if data is an video format
|
|
func (ct SniffedType) IsAudio() bool {
|
|
return strings.Contains(ct.contentType, "audio/") || strings.Contains(ct.contentType, "application/ogg")
|
|
}
|
|
|
|
// IsRepresentableAsText returns true if file content can be represented as
|
|
// plain text or is empty.
|
|
func (ct SniffedType) IsRepresentableAsText() bool {
|
|
return ct.IsText() || ct.IsSvgImage()
|
|
}
|
|
|
|
// IsBrowsableType returns whether a non-text type can be displayed in a browser
|
|
func (ct SniffedType) IsBrowsableBinaryType() bool {
|
|
return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
|
|
}
|
|
|
|
// GetMimeType returns the mime type
|
|
func (ct SniffedType) GetMimeType() string {
|
|
return strings.SplitN(ct.contentType, ";", 2)[0]
|
|
}
|
|
|
|
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
|
|
func DetectContentType(data []byte) SniffedType {
|
|
if len(data) == 0 {
|
|
return SniffedType{"text/unknown"}
|
|
}
|
|
|
|
ct := http.DetectContentType(data)
|
|
|
|
if len(data) > sniffLen {
|
|
data = data[:sniffLen]
|
|
}
|
|
|
|
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
|
|
|
|
detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
|
|
detectByXML := strings.Contains(ct, "text/xml")
|
|
if detectByHTML || detectByXML {
|
|
dataProcessed := svgComment.ReplaceAll(data, nil)
|
|
dataProcessed = bytes.TrimSpace(dataProcessed)
|
|
if detectByHTML && svgTagRegex.Match(dataProcessed) ||
|
|
detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
|
|
ct = SvgMimeType
|
|
}
|
|
}
|
|
|
|
if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
|
|
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
|
|
// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
|
|
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
|
|
ct2 := http.DetectContentType(data[3:])
|
|
if strings.HasPrefix(ct2, "text/") {
|
|
ct = ct2
|
|
}
|
|
}
|
|
|
|
return SniffedType{ct}
|
|
}
|
|
|
|
// DetectContentTypeFromReader guesses the content type contained in the reader.
|
|
func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
|
|
buf := make([]byte, sniffLen)
|
|
n, err := util.ReadAtMost(r, buf)
|
|
if err != nil {
|
|
return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
|
|
}
|
|
buf = buf[:n]
|
|
|
|
return DetectContentType(buf), nil
|
|
}
|