unhtml: HTML unmarshaler

最近要用 Go 写一个需要解析 HTML 的项目，在网上找到了一个叫 goquery 的库。虽然它的 API 挺不错，CSS selector 基本上也全支持了，但写这种逐个字段取值、转换的解析代码还是有点无聊。于是我就想，为什么不能像标准库的 encoding/json 和 encoding/xml 那样，直接 Unmarshal(HTML) 呢？

然后我花了两天时间写出了 unhtml（GitHub 传送门：https://github.com/Hexilee/unhtml）。

样例 & 性能

有个 HTML

// AllTypeHTML is the sample HTML document used by the examples and benchmarks.
// Its #test container holds, in order: a <ul> of four integer items, a nested
// <div> with three <p> values (a name, a number and a boolean), and four <p>
// values (a string, an integer, a float and a boolean).
var AllTypeHTML = []byte(`
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <ul>
            <li>0</li>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ul>
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <p>Hello World!</p>
        <p>10</p>
        <p>3.14</p>
        <p>true</p>
    </div>
</body>
</html>
`)

如果你想把它解析为一个结构体

package example

// PartTypesStruct is the parse target of the example. It holds one field for
// each kind of value found in the sample HTML: a slice of ints, a nested
// struct, and a string, int, float64 and bool.
type PartTypesStruct struct {
	Slice   []int
	Struct  TestUser
	String  string
	Int     int
	Float64 float64
	Bool    bool
}

// TestUser is the nested struct stored in PartTypesStruct.Struct.
type TestUser struct {
	Name      string
	Age       uint
	LikeLemon bool
}

直接用 goquery 要这样写

package example

import (
	"bytes"
	"github.com/PuerkitoBio/goquery"
	"strconv"
)

// parsePartTypesLogically builds a PartTypesStruct from AllTypeHTML by hand:
// it queries the document with goquery and converts every text value with
// strconv. It is the hand-written counterpart of unmarshaling the same HTML.
//
// The returned error is either the document parse error or the first
// conversion error encountered. On error the returned struct contains only
// the fields converted so far and should not be relied upon.
func parsePartTypesLogically() (PartTypesStruct, error) {
	partTypes := PartTypesStruct{}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML))
	if err != nil {
		return partTypes, err
	}
	root := doc.Find(partTypes.Root())

	// Slice: one int per <li>. Each cannot be aborted, so once a conversion
	// fails the remaining items are skipped and the first error is kept
	// (no zero value is appended for the item that failed).
	partTypes.Slice = make([]int, 0)
	root.Find(`ul > li`).Each(func(_ int, item *goquery.Selection) {
		if err != nil {
			return
		}
		var n int
		if n, err = strconv.Atoi(item.Text()); err == nil {
			partTypes.Slice = append(partTypes.Slice, n)
		}
	})
	if err != nil {
		return partTypes, err
	}

	// Struct: the three <p> children of the nested <div>.
	partTypes.Struct.Name = root.Find(`#test > div > p:nth-child(1)`).Text()
	// ParseUint with bitSize 0 targets uint and rejects negative input,
	// instead of letting a uint(int) conversion wrap around silently.
	age, err := strconv.ParseUint(root.Find(`#test > div > p:nth-child(2)`).Text(), 10, 0)
	if err != nil {
		return partTypes, err
	}
	partTypes.Struct.Age = uint(age)
	if partTypes.Struct.LikeLemon, err = strconv.ParseBool(root.Find(`#test > div > p:nth-child(3)`).Text()); err != nil {
		return partTypes, err
	}

	// Top-level scalars: the <p> elements that are children 3 to 6 of #test.
	partTypes.String = root.Find(`#test > p:nth-child(3)`).Text()
	if partTypes.Int, err = strconv.Atoi(root.Find(`#test > p:nth-child(4)`).Text()); err != nil {
		return partTypes, err
	}
	// bitSize must be 32 or 64; the field is a float64.
	if partTypes.Float64, err = strconv.ParseFloat(root.Find(`#test > p:nth-child(5)`).Text(), 64); err != nil {
		return partTypes, err
	}
	if partTypes.Bool, err = strconv.ParseBool(root.Find(`#test > p:nth-child(6)`).Text()); err != nil {
		return partTypes, err
	}
	return partTypes, nil
}

写得很难受

而现在你只要这么写

package main

import (
	"encoding/json"
	"fmt"
	"github.com/Hexilee/unhtml"
	"io/ioutil"
)

// PartTypesStruct is the unmarshal target of the example. Each field carries
// an `html` struct tag holding the CSS selector of the element(s) it is
// filled from.
type PartTypesStruct struct {
	Slice   []int    `html:"ul > li"`
	Struct  TestUser `html:"#test > div"`
	String  string   `html:"#test > p:nth-child(3)"`
	Int     int      `html:"#test > p:nth-child(4)"`
	Float64 float64  `html:"#test > p:nth-child(5)"`
	Bool    bool     `html:"#test > p:nth-child(6)"`
}

// TestUser is the nested struct stored in PartTypesStruct.Struct. Its `html`
// tags are short selectors without the "#test > div" prefix used on the
// parent field.
type TestUser struct {
	Name      string `html:"p:nth-child(1)"`
	Age       uint   `html:"p:nth-child(2)"`
	LikeLemon bool   `html:"p:nth-child(3)"`
}

// Root returns the CSS selector of the element that contains every value of a
// PartTypesStruct in the sample document.
// NOTE(review): presumably unhtml calls this to pick the root selection before
// applying the field tags — confirm against the unhtml README.
func (PartTypesStruct) Root() string {
	const rootSelector = "#test"
	return rootSelector
}

// main unmarshals AllTypeHTML into a PartTypesStruct and prints the result as
// JSON. If either step fails, the error is printed and nothing else is output.
// NOTE(review): AllTypeHTML is not declared in this snippet; it has to be
// provided elsewhere in package main.
func main() {
	allTypes := PartTypesStruct{}
	// `_ := ...` does not compile (":=" needs a new variable on its left
	// side), and the error is worth reporting anyway.
	if err := unhtml.Unmarshal(AllTypeHTML, &allTypes); err != nil {
		fmt.Println("unmarshal HTML:", err)
		return
	}
	result, err := json.Marshal(&allTypes)
	if err != nil {
		fmt.Println("marshal JSON:", err)
		return
	}
	fmt.Println(string(result))
}

就能得到结果

{
  "Slice": [
    0,
    1,
    2,
    3
  ],
  "Struct": {
    "Name": "Hexilee",
    "Age": 20,
    "LikeLemon": true
  },
  "String": "Hello World!",
  "Int": 10,
  "Float64": 3.14,
  "Bool": true
}

开发效率大大提升！但 unhtml 内部毫无疑问用了大量反射，让人担心它的运行效率。于是我写了两个 Benchmark 来对比：

// BenchmarkUnmarshalPartTypes times Unmarshal of AllTypeHTML into a fresh
// PartTypesStruct on every iteration and asserts that no error is returned.
func BenchmarkUnmarshalPartTypes(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	for n := 0; n < b.N; n++ {
		var target PartTypesStruct
		err := Unmarshal(AllTypeHTML, &target)
		assert.Nil(b, err)
	}
}

// BenchmarkParsePartTypesLogically times the hand-written goquery parser
// parsePartTypesLogically on every iteration and asserts that it returns no
// error; the parsed value itself is discarded.
func BenchmarkParsePartTypesLogically(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	for n := 0; n < b.N; n++ {
		_, parseErr := parsePartTypesLogically()
		assert.Nil(b, parseErr)
	}
}

测试结果:

> go test -bench=.
goos: darwin
goarch: amd64
pkg: github.com/Hexilee/unhtml
BenchmarkUnmarshalPartTypes-4        	   30000	     54096 ns/op
BenchmarkParsePartTypesLogically-4   	   30000	     45188 ns/op
PASS
ok  	github.com/Hexilee/unhtml	4.098s

从结果看，unhtml 每次解析约 54 µs，手写的 goquery 代码约 45 µs，前者慢了大约 20%。不过这只是展示和测试用的简单 HTML；在解析实际中更复杂的 HTML 时，两者的运行效率是十分接近的。

一些注意事项和特性请看 README

我来评几句
登录后评论

已发表评论数()

相关站点

热门文章