antchfx / xpath Goto Github PK
View Code? Open in Web Editor NEWXPath package for Golang, supports HTML, XML, JSON document query.
License: MIT License
XPath package for Golang, supports HTML, XML, JSON document query.
License: MIT License
As I mentioned in #44, I need reverse()
function which is part of xpath 2.0.
I did a quick prototyping. I'd like to have main contributors (@zhengchun ?) take a look to see if I'm completely off or not, before I create a pull request:
Below is the prototype diff (Not many tests added yet). Note the diff is large-ish because I renamed the existing functionQuery
to scalarFunctionQuery
to differentiate it from my newly added selectFunctionQuery
(which currently only has reverse
function but I think we can merge unionQuery
and logicalQuery
into selectFunctionQuery
eventually, tell me if I'm crazy)
diff --git a/build.go b/build.go
index be1214c..d6ff928 100644
--- a/build.go
+++ b/build.go
@@ -173,7 +173,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: startwithFunc(arg1, arg2)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: startwithFunc(arg1, arg2)}
case "ends-with":
arg1, err := b.processNode(root.Args[0])
if err != nil {
@@ -183,7 +183,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: endwithFunc(arg1, arg2)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: endwithFunc(arg1, arg2)}
case "contains":
arg1, err := b.processNode(root.Args[0])
if err != nil {
@@ -194,7 +194,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: containsFunc(arg1, arg2)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: containsFunc(arg1, arg2)}
case "substring":
//substring( string , start [, length] )
if len(root.Args) < 2 {
@@ -215,7 +215,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
return nil, err
}
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: substringFunc(arg1, arg2, arg3)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: substringFunc(arg1, arg2, arg3)}
case "substring-before", "substring-after":
//substring-xxxx( haystack, needle )
if len(root.Args) != 2 {
@@ -231,7 +231,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if arg2, err = b.processNode(root.Args[1]); err != nil {
return nil, err
}
- qyOutput = &functionQuery{
+ qyOutput = &scalarFunctionQuery{
Input: b.firstInput,
Func: substringIndFunc(arg1, arg2, root.FuncName == "substring-after"),
}
@@ -244,7 +244,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: stringLengthFunc(arg1)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: stringLengthFunc(arg1)}
case "normalize-space":
if len(root.Args) == 0 {
return nil, errors.New("xpath: normalize-space function must have at least one parameter")
@@ -253,7 +253,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: argQuery, Func: normalizespaceFunc}
+ qyOutput = &scalarFunctionQuery{Input: argQuery, Func: normalizespaceFunc}
case "replace":
//replace( string , string, string )
if len(root.Args) != 3 {
@@ -272,7 +272,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if arg3, err = b.processNode(root.Args[2]); err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: replaceFunc(arg1, arg2, arg3)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: replaceFunc(arg1, arg2, arg3)}
case "translate":
//translate( string , string, string )
if len(root.Args) != 3 {
@@ -291,7 +291,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if arg3, err = b.processNode(root.Args[2]); err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: translateFunc(arg1, arg2, arg3)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: translateFunc(arg1, arg2, arg3)}
case "not":
if len(root.Args) == 0 {
return nil, errors.New("xpath: not function must have at least one parameter")
@@ -300,7 +300,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: argQuery, Func: notFunc}
+ qyOutput = &scalarFunctionQuery{Input: argQuery, Func: notFunc}
case "name", "local-name", "namespace-uri":
if len(root.Args) > 1 {
return nil, fmt.Errorf("xpath: %s function must have at most one parameter", root.FuncName)
@@ -317,24 +317,24 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
}
switch root.FuncName {
case "name":
- qyOutput = &functionQuery{Input: b.firstInput, Func: nameFunc(arg)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: nameFunc(arg)}
case "local-name":
- qyOutput = &functionQuery{Input: b.firstInput, Func: localNameFunc(arg)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: localNameFunc(arg)}
case "namespace-uri":
- qyOutput = &functionQuery{Input: b.firstInput, Func: namespaceFunc(arg)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: namespaceFunc(arg)}
}
case "true", "false":
val := root.FuncName == "true"
- qyOutput = &functionQuery{
+ qyOutput = &scalarFunctionQuery{
Input: b.firstInput,
Func: func(_ query, _ iterator) interface{} {
return val
},
}
case "last":
- qyOutput = &functionQuery{Input: b.firstInput, Func: lastFunc}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: lastFunc}
case "position":
- qyOutput = &functionQuery{Input: b.firstInput, Func: positionFunc}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: positionFunc}
case "boolean", "number", "string":
inp := b.firstInput
if len(root.Args) > 1 {
@@ -347,7 +347,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
}
inp = argQuery
}
- f := &functionQuery{Input: inp}
+ f := &scalarFunctionQuery{Input: inp}
switch root.FuncName {
case "boolean":
f.Func = booleanFunc
@@ -368,7 +368,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: argQuery, Func: countFunc}
+ qyOutput = &scalarFunctionQuery{Input: argQuery, Func: countFunc}
case "sum":
if len(root.Args) == 0 {
return nil, fmt.Errorf("xpath: sum(node-sets) function must with have parameters node-sets")
@@ -377,7 +377,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- qyOutput = &functionQuery{Input: argQuery, Func: sumFunc}
+ qyOutput = &scalarFunctionQuery{Input: argQuery, Func: sumFunc}
case "ceiling", "floor", "round":
if len(root.Args) == 0 {
return nil, fmt.Errorf("xpath: ceiling(node-sets) function must with have parameters node-sets")
@@ -386,7 +386,7 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
- f := &functionQuery{Input: argQuery}
+ f := &scalarFunctionQuery{Input: argQuery}
switch root.FuncName {
case "ceiling":
f.Func = ceilingFunc
@@ -408,7 +408,16 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
}
args = append(args, q)
}
- qyOutput = &functionQuery{Input: b.firstInput, Func: concatFunc(args...)}
+ qyOutput = &scalarFunctionQuery{Input: b.firstInput, Func: concatFunc(args...)}
+ case "reverse":
+ if len(root.Args) == 0 {
+ return nil, fmt.Errorf("xpath: reverse(node-sets) function must with have parameters node-sets")
+ }
+ argQuery, err := b.processNode(root.Args[0])
+ if err != nil {
+ return nil, err
+ }
+ qyOutput = &selectFunctionQuery{Input: argQuery, Func: reverseFunc}
default:
return nil, fmt.Errorf("not yet support this function %s()", root.FuncName)
}
@@ -464,7 +473,7 @@ func (b *builder) processOperatorNode(root *operatorNode) (query, error) {
qyOutput = &booleanQuery{Left: left, Right: right, IsOr: isOr}
case "|":
qyOutput = &unionQuery{Left: left, Right: right}
- }
+ }
return qyOutput, nil
}
diff --git a/func.go b/func.go
index 1edeb1d..12d0ec2 100644
--- a/func.go
+++ b/func.go
@@ -526,8 +526,28 @@ func concatFunc(args ...query) func(query, iterator) interface{} {
// https://github.com/antchfx/xpath/issues/43
func functionArgs(q query) query {
- if _, ok := q.(*functionQuery); ok {
+ if _, ok := q.(*scalarFunctionQuery); ok {
return q
}
return q.Clone()
}
+
+func reverseFunc(q query, t iterator) func() NodeNavigator {
+ var list []NodeNavigator
+ for {
+ node := q.Select(t)
+ if node == nil {
+ break
+ }
+ list = append(list, node.Copy())
+ }
+ i := len(list) - 1
+ return func() NodeNavigator {
+ if i < 0 {
+ return nil
+ }
+ node := list[i]
+ i--
+ return node
+ }
+}
diff --git a/query.go b/query.go
index afeb890..115fdc8 100644
--- a/query.go
+++ b/query.go
@@ -590,25 +590,48 @@ func (f *filterQuery) Clone() query {
return &filterQuery{Input: f.Input.Clone(), Predicate: f.Predicate.Clone()}
}
-// functionQuery is an XPath function that call a function to returns
+// scalarFunctionQuery is an XPath function that call a function to returns
// value of current NodeNavigator node.
-type functionQuery struct {
+type scalarFunctionQuery struct {
Input query // Node Set
Func func(query, iterator) interface{} // The xpath function.
}
-func (f *functionQuery) Select(t iterator) NodeNavigator {
+func (f *scalarFunctionQuery) Select(t iterator) NodeNavigator {
return nil
}
// Evaluate call a specified function that will returns the
// following value type: number,string,boolean.
-func (f *functionQuery) Evaluate(t iterator) interface{} {
+func (f *scalarFunctionQuery) Evaluate(t iterator) interface{} {
return f.Func(f.Input, t)
}
-func (f *functionQuery) Clone() query {
- return &functionQuery{Input: f.Input.Clone(), Func: f.Func}
+func (f *scalarFunctionQuery) Clone() query {
+ return &scalarFunctionQuery{Input: f.Input.Clone(), Func: f.Func}
+}
+
+type selectFunctionQuery struct {
+ Input query
+ Func func(query, iterator) func() NodeNavigator
+ iterator func() NodeNavigator
+}
+
+func (f *selectFunctionQuery) Select(t iterator) NodeNavigator {
+ if f.iterator == nil {
+ f.iterator = f.Func(f.Input, t)
+ }
+ return f.iterator()
+}
+
+func (f *selectFunctionQuery) Evaluate(t iterator) interface{} {
+ // QUESTION: when is this ever called?
+ f.iterator = f.Func(f.Input, t)
+ return f
+}
+
+func (f *selectFunctionQuery) Clone() query {
+ return &selectFunctionQuery{Input: f.Input.Clone(), Func: f.Func}
}
// constantQuery is an XPath constant operand.
diff --git a/xpath_test.go b/xpath_test.go
index 7b7d14b..edb222d 100644
--- a/xpath_test.go
+++ b/xpath_test.go
@@ -1,9 +1,9 @@
package xpath
import (
- "bytes"
- "strings"
- "testing"
+ "bytes"
+ "strings"
+ "testing"
)
var (
@@ -729,3 +729,12 @@ func example() *TNode {
return xhtml
}
+
+func TestReverse(t *testing.T) {
+ list := selectNodes(html, "reverse(//li)")
+ if len(list) != 4 { t.Fatal("reverse(//li) should return 4 <li> nodes") }
+ if list[0].Value() != "" { t.Fatal("reverse(//li)[0].Value() should be ''") }
+ if list[1].Value() != "login" { t.Fatal("reverse(//li)[0].Value() should be 'login'") }
+ if list[2].Value() != "about" { t.Fatal("reverse(//li)[0].Value() should be 'about'") }
+ if list[3].Value() != "Home" { t.Fatal("reverse(//li)[0].Value() should be 'Home'") }
+}
Thank you very much!
Hi @zhengchun,
I was working with your library and I've noticed the normalize-space
function is not replacing internal whitespace characters with one space as per the spec:
https://www.w3.org/TR/1999/REC-xpath-19991116/#function-normalize-space
The normalize-space function returns the argument string with whitespace normalized by stripping leading and trailing whitespace and replacing sequences of whitespace characters by a single space. [...]
Here's the minimal Go code that reproduces the issue:
package main
import (
"fmt"
"github.com/antchfx/htmlquery"
"os"
"strings"
)
// mustSucceed is a fail-fast helper for the reproduction program: when err is
// non-nil it prints the error to standard output and terminates the process
// with exit status 1; when err is nil it returns without doing anything.
func mustSucceed(err error) {
	// Nothing to report on success.
	if err == nil {
		return
	}
	fmt.Printf("Error: %v", err)
	os.Exit(1)
}
// main reproduces the reported normalize-space() behaviour: it parses a small
// HTML document and queries for a <div> whose normalized text equals
// "Match me", printing "No matches" when the query returns nil.
func main() {
// The <div> text is split over two lines, so a line break sits between
// "Match" and "me" in the parsed document.
htmlDoc := `<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title></title>
</head>
<body>
<div>Match
me</div>
</body>
</html>`
root, err := htmlquery.Parse(strings.NewReader(htmlDoc))
mustSucceed(err)
// The predicate compares normalize-space(text()) against the single-space
// string "Match me".
node := htmlquery.FindOne(root, "//div[normalize-space(text())=\"Match me\"]")
if node == nil {
// FindOne returned no node for the expression above.
fmt.Printf("No matches")
}
}
This would print No matches
.
Thank you for your time and effort!
<message>
<from>[email protected]</from>
<to>[email protected]</to>
<body ctime="1 January 1980">
Isn't XPATH awsome?
</body>
</message>
/message/body/@ctime
The xpath library will return the <body>
node rather than the ctime
attribute of the node.
$ cat example.xml
<message>
<from>[email protected]</from>
<to>[email protected]</to>
<body ctime="1 January 1980">
Isn't XPATH awsome?
</body>
</message>
$ xmllint --version
xmllint: using libxml version 20903
compiled with: Threads Tree Output Push Reader Patterns Writer SAXv1 FTP HTTP DTDValid HTML Legacy C14N Catalog XPath XPointer XInclude Iconv ISO8859X Unicode Regexps Automata Expr Schemas Schematron Modules Debug Zlib Lzma
$ xmllint --xpath "/message/body/@ctime" ./example.xml
ctime="1 January 1980"
I also believe that this should be the behavior after reading XPATH 3.0 wherein it states:
Here are some examples of path expressions that use the abbreviated syntax:
para
selects the para element children of the context node
*
selects all element children of the context node
text()
selects all text node children of the context node
@name
selects the name attribute of the context node
I'm currently working on my fork of this repo to produce a test which will specifically support node attribute selection
// TestXPathName parses a JSON document containing a "Reviews" object and a
// "Books" array, then queries for the Price of the book whose Title equals
// name(/Reviews/*). It asserts that parsing and querying return no error and
// that the query result is not nil.
func TestXPathName(t *testing.T) {
doc, err := jsonquery.Parse(strings.NewReader(`{
"Reviews": {
"A Tale of Two Cities": "Not Bad"
},
"Books": [
{
"Title": "Of Mice and Men",
"Price": 11
},
{
"Title": "A Tale of Two Cities",
"Price": 10
}
]
}`,
))
assert.NoError(t, err)
// The predicate is evaluated for each child of /Books, and name(/Reviews/*)
// appears inside it, so the name() call is evaluated more than once.
currentBookPrice, err := jsonquery.Query(doc, `/Books/*[Title=name(/Reviews/*)]/Price`)
assert.NoError(t, err)
assert.NotNil(t, currentBookPrice)
}
When running the query, the filter evaluates its expression multiple times. However, the name()
function always runs Select(t)
on the same query. Because queries contain their own iterator, the first time the name function is evaluated it returns the name of the first match, and the second time it returns the name of the second match. This can be demonstrated in the example by removing "Of Mice and Men" from the JSON document, which allows the test to pass.
I admit I don't fully understand the expression logic, but it seems like when a function like name
calls Select(t)
on its argument it should call Clone()
first to avoid using the same iterator.
see: antchfx/xquery#15
HI friends, thank you for building this project, I am not very familiar with XPATH, and have a small question
if I have a node that looks like this:
<books>
<book> book name. </book>
</books>
and I ask for this expression:
//books/book
then, is there a way for the xpath to return the normalize-space()'d version of the book name?
as in, replace " book name. " and return only "book name." without any trailing or leading spaces?
is that even possible with xpath.
how to deal with?
Jenkins config files both xml1.1
This happens when calling .InnerText
when a page happens to be blank (such as a redirect).
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x28 pc=0x6a496a]
goroutine 8 [running]:
github.com/antchfx/htmlquery.(*NodeNavigator).NodeType(0xc0000a08b8, 0x40f35b)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/query.go:205 +0x2a
github.com/antchfx/xpath.axisPredicate.func1(0x7afca0, 0xc0000a08b8, 0x7f460faa35b8)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/build.go:45 +0x51
github.com/antchfx/xpath.(*builder).processAxisNode.func1(0x7afca0, 0xc0000a08b8, 0xc0000adc10)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/build.go:82 +0x46
github.com/antchfx/xpath.(*descendantQuery).Select.func1(0x71ffa0, 0xc00020a900)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/query.go:252 +0x1e3
github.com/antchfx/xpath.(*descendantQuery).Select(0xc0000dc640, 0x7a89a0, 0xc0000ca740, 0x20, 0x705340)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/query.go:284 +0x34
github.com/antchfx/xpath.(*NodeIterator).MoveNext(0xc0000ca740, 0xc0000ca740)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/xpath.go:86 +0x49
github.com/antchfx/htmlquery.QuerySelector(0x0, 0xc0000ca700, 0xc0000ca700)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/query.go:73 +0xe8
github.com/antchfx/htmlquery.Query(0x0, 0x744dbd, 0x7, 0x7a8e00, 0xc00020a810, 0x0)
/home/acook/go/pkg/mod/github.com/antchfx/[email protected]/query.go:67 +0x85
main.getElementFromURL(0xc0000b27d0, 0x4, 0x744dbd, 0x7, 0x1, 0x11)
/home/acook/Source/Mein/Go/TuneBot/main.go:142 +0x75
I assume that XPath 2.0 is not (fully) implemented (yet), so feel free to close this issue, as XPath 1 does not support this expression.
This program:
package main
import (
"log"
"github.com/antchfx/xpath"
)
// main compiles the expression (4,2) with xpath.Compile and terminates through
// log.Fatal when the compiler reports an error. The compiled expression itself
// is discarded; only the error is of interest.
func main() {
	if _, compileErr := xpath.Compile(`(4,2)`); compileErr != nil {
		log.Fatal(compileErr)
	}
}
writes out (4,2) has an invalid token
.
v1.2.0
I'd expect the valid sequence (4,2)
to return no error.
t.Run("Xml", func(t *testing.T) {
doc, _ := xmlquery.Parse(strings.NewReader(`<soapenv:Envelope
xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/">
<soapenv:Header/>
<soapenv:Body>
<tem:GetOtpuskTovarov>
<tem:_dateFrom>?</tem:_dateFrom>
<tem:_dateTo>?</tem:_dateTo>
</tem:GetOtpuskTovarov>
</soapenv:Body>
</soapenv:Envelope>
`))
fmt.Println(doc.OutputXML(false))
if strings.Index(doc.OutputXML(false), "<tem:_dateFrom>") < 0 {
t.Error("OutputXML")
return
}
})
stdout:
<?xml?><Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/"><Header></Header><Body><GetOtpuskTovarov><_dateFrom>!!!</_dateFrom><_dateTo>?</_dateTo></GetOtpuskTovarov></Body></Envelope>
Steps to reproduce: save the content of view-source:https://www.productfrom.com/product/416492-adidas-copa-mundial-soccer-shoes
in test.html
package main
import (
"fmt"
"github.com/antchfx/htmlquery"
)
// main loads test.html from the working directory, runs a single XPath query
// against it, and prints the inner text of the first matching <span> if one is
// found. It prints nothing when FindOne returns nil.
func main() {
doc, err := htmlquery.LoadDoc("test.html")
if err != nil {
panic(err)
}
// The last class test in this expression is 'text:right', with no leading space.
product := htmlquery.FindOne(doc, "//div[contains(@class, 'grid grid-cols-12 gap-0 border-t py-2 px-4')]//div[contains(.,'Product Name')]/..//div[contains(@class, 'text:right')]//span")
if product != nil {
fmt.Println("product:", htmlquery.InnerText(product))
}
}
When I test the xpath expression online, e.g. here: https://htmlstrip.com/xpath-tester, then it finds a match using this expression: //div[contains(@class, 'grid grid-cols-12 gap-0 border-t py-2 px-4')]//div[contains(.,'Product Name')]/..//div[contains(@class, ' text:right')]//span
.
Hi,
I have an issue with the xpath library, when I use ./rootElem
on the following document:
<rootElem>
foobar
</rootElem>
I expect XPath to select the root element, but instead I get nothing. When looking at FreeFormatter XPath processor with this use case, I have a non empty result.
I think I could debug this back to the axis predicate function. In build.go:
Lines 40 to 47 in b4b6b74
While evaluating the contextQuery, the condition typ == n.NodeType() || typ == TextNode
is false because the expected type, typ
is ElementNode
(1) while n.NodeType()
is a RootNode
(0). The trick here is that the root node is technically an element too, and it shouldn't be treated differently.
The context query ends up not selecting anything from the document and the rest of the xpath evaluation cannot take place.
I have a test case ready here:
Hello,
According to docs, sum and multiply is implemented, but I can't get this to work:
// main reproduces the reported failure: it parses an <items> document and
// tries to compile and evaluate an expression that sums quantity * unit-price
// over every <item>. It panics if parsing or compilation returns an error, or
// if the evaluation result is not a float64.
func main() {
	doc, err := xmlquery.Parse(strings.NewReader(`
<items>
<item type="goods">
<quantity unit="pcs">1</quantity>
<unit-price currency="PLN">2</unit-price>
<VAT>7</VAT>
</item>
<item type="goods">
<quantity unit="pcs">10</quantity>
<unit-price currency="PLN">20</unit-price>
<VAT>7</VAT>
</item>
<item type="goods">
<quantity unit="pcs">100</quantity>
<unit-price currency="PLN">200</unit-price>
<VAT>7</VAT>
</item>
</items>
`))
	// Check the parse error here: err is reassigned by xpath.Compile below,
	// so a parse failure would otherwise be silently lost.
	if err != nil {
		panic(err)
	}
	expr, err := xpath.Compile("sum(/items/item/(quantity * unit-price))")
	if err != nil {
		panic(err)
	}
	// Unchecked assertion: panics if Evaluate returns anything but a float64.
	res := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
	print(res)
}
This code panics with
panic: sum(/items/item/(quantity * unit-price)) has an invalid token
This should work:
package util
import (
"strings"
"testing"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
)
const doc = `
<html>
<body>
<div>
<form id="appForm" action="https://sp.example.com/auth/saml20" method="POST">
<input name="SAMLResponse" type="hidden" value="my-value"/>
</form>
</div>
</body>
</html>
`
// TestSaml evaluates an XPath that appends a text() step to the @value
// attribute of the SAMLResponse input and expects the result to be the string
// "my-value".
func TestSaml(t *testing.T) {
// Errors from Compile and Parse are discarded in this reproduction.
samlPath, _ := xpath.Compile("//form[@id='appForm']/input[@name='SAMLResponse']/@value/text()")
htmlDoc, _ := htmlquery.Parse(strings.NewReader(doc))
// Unchecked type assertion: this panics when Evaluate returns a value that
// is not a string.
samlValue := samlPath.Evaluate(htmlquery.CreateXPathNavigator(htmlDoc)).(string)
if samlValue != "my-value" {
t.Errorf("Parsed document incorrectly: %s", samlValue)
}
}
but it results in
=== RUN TestSaml
--- FAIL: TestSaml (0.00s)
panic: interface conversion: interface {} is *xpath.NodeIterator, not string [recovered]
panic: interface conversion: interface {} is *xpath.NodeIterator, not string
This works:
package util
import (
"strings"
"testing"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
)
const doc = `
<html>
<body>
<div>
<form id="appForm" action="https://sp.example.com/auth/saml20" method="POST">
<input name="SAMLResponse" type="hidden" value="my-value"/>
</form>
</div>
</body>
</html>
`
// TestSaml2 selects the @value attribute of the SAMLResponse input through
// htmlquery.QuerySelector and reads the value from the first child of the
// returned node, expecting "my-value".
func TestSaml2(t *testing.T) {
// Errors from Compile and Parse are discarded in this reproduction.
samlPath, _ := xpath.Compile("//form[@id='appForm']/input[@name='SAMLResponse']/@value")
htmlDoc, _ := htmlquery.Parse(strings.NewReader(doc))
samlResult := htmlquery.QuerySelector(htmlDoc, samlPath)
// The value is read from the Data field of the result's first child;
// samlResult and FirstChild are not checked for nil first.
samlValue := samlResult.FirstChild.Data
if samlValue != "my-value" {
t.Errorf("Parsed document incorrectly: %s", samlValue)
}
}
samlResult presents itself as a ElementNode (3) with a single child, a TextNode. So it is not clear to me why /text()
does not select that (and allows samlPath.Evaluate(...) to return a string) but returns a NodeNavigator.
This is a very vague question, feel free to close it.
I'd like to add variables to expressions, so that queries such as xpath.Compile(
$a + 4)
would work and evaluate to whatever is stored in $a
+ 4.
Do you have any suggestion where to add them? I assume that I need to pass some kind of context (with the defined variables) to Evaluate()
. IMO these should be handled as part of the queries or in a similar structure. Perhaps you have a different suggestion where to add these?
The issue occurs when a single Expr
instance is reused to query multiple documents.
expr := xpath.MustCompile("//a")
for i := 0; i < 5; i++ {
doc, _ := htmlquery.LoadURL("http://bing.com")
iter := expr.Select(htmlquery.CreateXPathNavigator(doc))
var count = 0
for iter.MoveNext() {
count++
}
fmt.Println(count)
}
output:
34
0
0
0
0
Is there any timeline for supporting the Xpath not() condition?
//node()[not(self::script)]
I need to concat two fields from xml and return them combined
In the following document:
<?xml version="1.0"?>
<root>
<foo>
<bar attr="b"/>
</foo>
</root>
the xpath expression /foo/bar@attr
matches <bar/>
instead of attr="b"
Test case in #8 instantiated in antchfx/xquery#19
If we use the test condition below it will panic.
It panics because in this case substring
returns false (boolean)
, so when it tries to compare false = ''
it fails.
If we remove the comparison it works fine, but should substring
work as a boolean condition?
Panic:
testXPath2(t, html, "//title[substring(child::*,0) = '']", 0)
Work:
testXPath2(t, html, "//title[substring(child::*,0)]", 0)
Usecase: http://xpather.com/Kp2lY28A
I am using a xpath to select all p
elements containing sup
elements between tables.
for i := 1; i < 10; i++ {
xp := fmt.Sprintf("//table[%v]/following-sibling::p[child::sup][count(.|//table[%v]/preceding-sibling::p[child::sup])=count(//table[%v]/preceding-sibling::p[child::sup])]", i, i+1, i+1)
expr, err := xpath.Compile(xp)
if err != nil {
logger.Log.Error(err)
return nil, "", err
}
iter := expr.Evaluate(htmlquery.CreateXPathNavigator(dc)).(*xpath.NodeIterator)
for iter.MoveNext() {
logger.Log.Debug(iter.Current().Value())
}
}
The final xpath is //table[1]/following-sibling::p[child::sup][count(.|//table[2]/preceding-sibling::p[child::sup])=count(//table[2]/preceding-sibling::p[child::sup])]
. When I test this xpath in the usecase it returns the correct values. Am I using a xpath which isn't supported or is it a bug somewhere in this library?
Using xmlquery, the following program outputs preceding-sibling::*[2] not found
, even though both the XPath expressions are correct, and should evaluate to node b
and c
respectively:
package main
import "strings"
import "github.com/antchfx/xmlquery"
// main parses a small XML document in which <a> has three children (b, c, d)
// and runs two preceding-sibling queries starting from the element with
// id="d". For each query it prints a "not found" message when FindOne returns
// nil.
func main() {
// The parse error is discarded; the input is the fixed literal below.
doc, _ := xmlquery.Parse(strings.NewReader(`<root>
<a>
<b id="b"></b>
<c id="c"></c>
<d id="d"></d>
</a>
</root>`))
// Second preceding sibling of d.
if node := xmlquery.FindOne(doc, `//*[@id="d"]/preceding-sibling::*[2]`); node == nil {
println("preceding-sibling::*[2] not found")
}
// First preceding sibling of d.
if node := xmlquery.FindOne(doc, `//*[@id="d"]/preceding-sibling::*[1]`); node == nil {
println("preceding-sibling::*[1] not found")
}
}
Maybe related to #36?
What is your opinion about PR to create ends-with( string, string )
function?
If you don't mind I can do it.
Hi, thanks for your library. It works very well, except for a bug I found.
With the selector (//p//time//ancestor::p[1])[starts-with(normalize-space(.), 'Published on ')]
the CPU and memory go to the roof. There's likely an infinite loop somewhere. I've been trying to find a fix but couldn't manage.
Here's a file to reproduce the issue.
xpath.txt
I know it sounds odd at first but please hear me out.
I've been searching for an XML-2.0-compatible Go-based XML manipulation tool/lib for a long, long time, and I believe no such tool/lib exists. I could go on and on about the tools that I looked into before, but please do let me know if I'm wrong and there is indeed such a Go-based XML manipulation tool/lib out there that can satisfy the following simple requirements:
I saw that this xpath
package already has functions like addAttribute
, createChildNode
, so the first step making xpath into a dom manipulation tool is simple, just expose those ready-made functions as public, and adding the missing functionalities would be step 2.
What would you say?
See matches
spec at: https://www.w3.org/TR/xpath-functions-31/#func-matches
Note 1: https://www.w3.org/TR/xpath-functions-31/#flags shows all the supported flags, but will only implement flags
's'
, 'm'
, 'i'
and 'q'
. Will not include 'x'
due to lack of native support in https://golang.org/pkg/regexp/syntax/.
Note 2 / Question: for perf reason, we want to compile regexp and cache them. Thus we need a caching solution. Options:
sync.Map
and a simple helper function to load/compile regexp.sync.Map
.@zhengchun What do you think?
for example
<html>
<body>
<div>1_1</div>
<div>1_2</div>
<div>1_3</div>
<div>
<div>2_1</div>
<div>2_2</div>
</div>
<div>
<div>3_1</div>
<div>3_2</div>
</div>
</body>
</html>
doc, _ := htmlquery.Parse(strings.NewReader(html))
test2 := `//div[2]`
for _, n := range htmlquery.Find(doc, test2) {
fmt.Println(htmlquery.InnerText(n))
}
expected output 2 matched nodes: 1_2
and 2_2
but it output 1_1
.
For the following URL: https://victorious.fandom.com/wiki/Gallery:Victorious_Cast_in_Real_Life if you do a simple find like in the code below, the parser will take a few minutes to finish:
nodes := htmlquery.Find( doc, # the html document "//div/@data-src | //link/@href | //img/@src | //img/@data-src | //img/@image-src | //div/@data-imgsrc | //object/@data | //a/@href | //span/@data-href | //img/@srcset | //source/@data-srcset | //li/@data-src | //source/@srcset")
Let me know if a pprof will help or if you need anything else from me.
Thank you very much for the work on this library, otherwise it works really great at scale.
This is more of a question/enhancement than an issue.
Is there a way to load just subset of the xml tree and not the whole xml file?
I need to parse large xml files and not interested in all the data from those files but just data from specific nodes.
I was thinking that instead of loading the entire XML and then xpath it, my code will traverse (using standard xml GetToken() method) to an interesting node and ill load just that code to your library and query just it..
Thanks
Versions:
This test is failing:
// TestXPathAnd checks that a predicate combining two nested predicates with
// "and" selects the <child> element that has both a <toy> with text "A" and a
// <toy> with text "B", and that the text of its <name> can then be iterated.
func TestXPathAnd(t *testing.T) {
	body := `
<top>
<child><name>Child1</name>
<toy>A</toy>
<toy>B</toy>
</child>
</top>`
	root, err := xmlquery.Parse(strings.NewReader(body))
	assert.NoError(t, err)
	expr, err := xpath.Compile(`/top/child[toy[text() = "A"] and toy[text()="B"]]/name/text()`)
	assert.NoError(t, err)
	result := expr.Evaluate(xmlquery.CreateXPathNavigator(root))
	switch rt := result.(type) {
	case *xpath.NodeIterator:
		// The original line ended with a stray '.', which chained the next
		// line onto this call as a selector and did not compile.
		assert.True(t, rt.MoveNext()) // Fails
		fmt.Println("Value:", rt.Current().Value())
	}
}
Compare this with standard macos xpath utility:
xpath -e '/top/child[toy[text() = "A"] and toy[text()="B"]]/name/text()'
<top>
<child><name>Child1</name>
<toy>A</toy>
<toy>B</toy>
</child>
</top>
Found 1 nodes in stdin:
-- NODE --
Child1
Either xpath.MustCompile needs to fail, or xpath.Expr.Select needs to not crash.
$ head -999 go.mod main.go
==> go.mod <==
module example.com/m
go 1.13
require (
github.com/antchfx/htmlquery v1.1.0
github.com/antchfx/xpath v1.1.0
golang.org/x/net v0.0.0-20191101175033-0deb6923b6d9
)
==> main.go <==
package main
import (
"log"
"strings"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
)
// main reproduces the reported crash: it parses an empty HTML document,
// compiles the expression `//` with xpath.MustCompile, and passes the result
// to htmlquery.QuerySelector.
func main() {
// The input document is the empty string.
const doc = ``
tree, err := html.Parse(strings.NewReader(doc))
if err != nil {
log.Fatalf("html parse: %v", err)
}
exp := xpath.MustCompile(`//`)
// The query result is deliberately discarded; only the call itself matters.
_ = htmlquery.QuerySelector(tree, exp)
}
$ go run .
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0x6950c1]
goroutine 1 [running]:
github.com/antchfx/xpath.(*Expr).Select(...)
github.com/antchfx/[email protected]/xpath.go:130
github.com/antchfx/htmlquery.QuerySelector(0xc0000ca230, 0x0, 0x0)
github.com/antchfx/[email protected]/query.go:70 +0x61
main.main()
example.com/m@/main.go:19 +0x10a
exit status 2
$
Could you add support for exists() func?
I would like to create a condition using exists
, something like this:
/SpliceInfoSection/SegmentationDescriptor[not(exists(DeliveryRestrictions))]
Hey,
when running this code:
package main
import (
"fmt"
"strings"
"github.com/antchfx/xpath"
"github.com/antchfx/xmlquery"
)
// main parses an XML document containing seven <Value> elements under <Data>,
// selects them with //Data/child::Value, and then, for each selected node,
// compiles and evaluates the expression position() with that node as the
// context, printing the loop index, the result, and the result's Go type.
func main(){
xml := `
<?xml version="1.0"?>
<Indices>
<Index>
<Name>1</Name>
<Data>
<Value>0</Value>
<Value>1</Value>
<Value>2</Value>
<Value>3</Value>
<Value>4</Value>
<Value>5</Value>
<Value>6</Value>
</Data>
</Index>
</Indices>
`
// Parse the XML
doc, err := xmlquery.Parse(strings.NewReader(xml))
if err != nil {
panic(err)
}
// Select every <Value> child of <Data>.
selected, err := xmlquery.QueryAll(doc, "//Data/child::Value")
if err != nil {
panic(err)
}
if len(selected) > 0 && selected[0] != nil {
for i, context := range selected {
// A fresh expression is compiled on every iteration.
expr, err := xpath.Compile("position()")
if err != nil {
panic(err)
}
// Evaluate position() with a navigator created from the current node.
r := expr.Evaluate(xmlquery.CreateXPathNavigator(context))
fmt.Printf("[%02d] result: %v (%T)\n", i, r, r)
}
}
}
I get the following output
[00] result: 2 (float64)
[01] result: 4 (float64)
[02] result: 6 (float64)
[03] result: 8 (float64)
[04] result: 10 (float64)
[05] result: 12 (float64)
[06] result: 14 (float64)
where the expected position output is 1, 2, ..., 7...
What am I doing wrong here?
My xml data is -
<root> <measEntity localDn="VNFID=bangloreiit-5005cucp1; gNBID=5005; gNBName=;" swVersion="6.0.00.49.6"/> <measData> <measInfo measInfoId="OR.end"> <granPeriod duration="PT300S" endTime="2021-07-02T17:10:00+00:00"/> <measTypes>SgnbAddAttemptCell.enb.0 SgnbAddAttemptCell.enb.1 SgnbAddAttemptCell.enb.2 SgnbAddAttemptCell.enb.3</measTypes> <measResults>0 0 0 0</measResults> </measInfo> </measData> </root>
I wanted to parse it using telegraf. I am able to get the measurement, but currently I am facing 2 challenges:
Tag name - measEntity should provide the tag names; e.g. tag VNFID with value bangloreiit-5005cucp1 and so on. But I am not able to get the tags.
Field name - Using measTypes I want to get the field names, which are separated by spaces (e.g. SgnbAddAttemptCell.enb.0, SgnbAddAttemptCell.enb.1 and so on), and their respective values from the measResults portion. I tried using tokenize but it gave an error in telegraf. Is there any way to do that using other predefined functions?
Am I right in thinking that the XPath 2.0 functions tokenize() and string-join() are not supported (yet)? I like to use the tokenize function to extract items from XML attributes that can hold several space-separated items with functions like this:
tokenize(tokenize(./persName, 'orcid:')[2], ' ')[1]
Is there any policy about XPath 1.0 vs XPath 2.0 (or even 3.1) support? I could not find any beyond the list of supported functions.
Hi,
Been using the library for my exams, and I love it very much. 👍
However, I've encountered an issue where I've set a predicate based on e.g. an id, and it doesn't produce a result. But it works just fine using xmllint --xpath.
/StudentSubmissions/Department/Students/Student[@studentNo = /StudentSubmissions/Department/Courses/Course/Assignments[@studyYear = '2017']/Assignment[@no = '1' and not(@no = '6')]/Submission/@student]
<Student studentNo="1">
<FamilyName>Student</FamilyName>
<GivenName>Another</GivenName>
</Student>
package main
import (
"os"
"github.com/antchfx/xmlquery"
)
func main() {
f, err := os.Open("###FILE###")
if err != nil {
panic(err)
}
doc, err := xmlquery.Parse(f)
if err != nil {
panic(err)
}
for _, n := range xmlquery.Find(doc, "/StudentSubmissions/Department/Students/Student[@studentNo = /StudentSubmissions/Department/Courses/Course/Assignments[@studyYear = '2017']/Assignment[@no = '1' and not(@no = '6')]/Submission/@student]") {
fmt.Printf("%s\n", n.OutputXML(true))
}
}
<?xml version="1.0"?>
<!DOCTYPE StudentSubmissions SYSTEM "StudentSubmissions.dtd">
<StudentSubmissions>
<Department shortName="IDI">
<Name>Computer Science</Name>
<Students cycle="B">
<Student studentNo="1">
<FamilyName>Student</FamilyName>
<GivenName>Another</GivenName>
</Student>
</Students>
<Courses cycle="B">
<Course>
<Code>IMT2571</Code>
<Title>Data Modelling and Database Systems</Title>
<Assignments studyYear="2017" min="5" max="6">
<Assignment no="1">
<Title>Database Application</Title>
<Description>Develop and test database application.</Description>
<Deadline>2017-09-10</Deadline>
<Submission student="1" date="2017-09-09">
<Outcome>1</Outcome>
<Comments>Well done!</Comments>
</Submission>
</Assignment>
<Assignment no="2">
<Title>SQL</Title>
<Description>Insert, query & modify database data.</Description>
<Deadline>2017-09-24</Deadline>
<Submission student="1" date="2017-10-02"/>
</Assignment>
<Assignment no="3">
<Title>Conceptual Modelling</Title>
<Description>Designing a conceptual (UML) model.</Description>
<Deadline>2017-10-08</Deadline>
</Assignment>
</Assignments>
</Course>
<Course>
<Code>IMT2261</Code>
<Title>Information Structures and Database Systems</Title>
<Assignments studyYear="2017" min="6" max="7"> </Assignments>
</Course>
</Courses>
<Courses cycle="M">
<Course>
<Code>IMT4886 Applied Computer Science Project</Code>
<Title/>
</Course>
</Courses>
</Department>
</StudentSubmissions>
Given the following document:
<root>
<para> text <img/> text </para>
</root>
Given the context node selected by //para
When selecting child nodes using ./child::node()
Then, only the <img/> node is selected instead of all three nodes including the two text nodes. I believe this is a similar issue to #11: the default predicate for all queries is to select only element nodes.
I have the following example document:
<?xml version="1.0"?>
<root>
<foo>
<bar a="b"/>
</foo>
<li>
<a class="tag" href=""/>
</li>
<p class='meta'>
</p>
</root>
with the following xpath expression: //root/p[@class='meta']
The expression does not find the <p/> node while the XPath tester can.
I was testing the translate function by running the following xpath expression: //character[number(translate(./born,'-','')) > 19600000]/name
It causes a panic because the asString function which translateFunc calls can't handle a *xpath.childQuery type as value.
I made this example to demonstrate the issue:
package main
import (
"fmt"
"strings"
"github.com/antchfx/xmlquery"
"github.com/antchfx/xpath"
)
var xml = `
<library>
<!-- Great book. -->
<book id="b0836217462" available="true">
<isbn>0836217462</isbn>
<title lang="en">Being a Dog Is a Full-Time Job</title>
<quote>I'd dog paddle the deepest ocean.</quote>
<author id="CMS">
<?echo "go rocks"?>
<name>Charles M Schulz</name>
<born>1922-11-26</born>
<dead>2000-02-12</dead>
</author>
<character id="PP">
<name>Peppermint Patty</name>
<born>1966-08-22</born>
<qualification>bold, brash and tomboyish</qualification>
</character>
<character id="Snoopy">
<name>Snoopy</name>
<born>1950-10-04</born>
<qualification>extroverted beagle</qualification>
</character>
</book>
</library>`
func main() {
reader := strings.NewReader(xml)
doc, err := xmlquery.Parse(reader)
if err != nil {
panic(err)
}
query := "//character[number(translate(./born,'-','')) > 19600000]/name"
expr, err := xpath.Compile(query)
if err != nil {
panic(err)
}
nav := xmlquery.CreateXPathNavigator(doc)
iter := expr.Select(nav)
for iter.MoveNext() {
fmt.Println(iter.Current().Value())
}
}
The following code makes the library crash:
query := `a | h2`
expr := xpath.MustCompile(query)
expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(*xpath.NodeIterator)
The output is:
panic: interface conversion: interface {} is *xpath.childQuery, not bool
goroutine 1 [running]:
github.com/antchfx/xpath.(*booleanQuery).Evaluate(0xc42007bad0, 0x1405b00, 0xc4200da560, 0xc4200da520, 0xc42007edd0)
.../go/src/github.com/antchfx/xpath/query.go:710 +0xe4
github.com/antchfx/xpath.(*Expr).Evaluate(0xc4200da520, 0x140b6c0, 0xc4200da540, 0x0, 0x0)
.../go/src/github.com/antchfx/xpath/xpath.go:120 +0x8f
main.main()
.../go/src/github.com/vovchynniko/htmlquery_test/main.go:24 +0x132
The document can be any valid xml/html.
Hi,
Our application is using the following npm command on a Windows 10 machine using either cf6 or cf7
"deploy": "cf deploy mta_archives/*.mtar"
However, the same command works fine on Business Application Studio and macOS Big Sur.
Updating the command to "deploy": "cf deploy mta_archives/<filename-mta>.mtar" works fine. Our business case is that other applications can generate an mta specifying a different filename, so we just want to deploy whatever version is found in the mta_archives folder.
panic: runtime error: invalid memory address or nil pointer dereference
[signal 0xc0000005 code=0x0 addr=0x18 pc=0x1195638]
goroutine 1 [running]:
github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands.getMtaArchive(0xc00004ad20, 0x1, 0x1, 0x101, 0xc000017208, 0xc0004ff4d0, 0xe93076, 0xc00007ef00)
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands/deploy_command.go:354 +0x218
github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands.(*DeployCommand).executeInternal(0xc00059c700, 0xc00004ad20, 0x1, 0x1, 0xc000498270, 0x2b, 0xc00029eba0, 0xc000499ce0, 0x24, 0xc00068e940, ...)
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands/deploy_command.go:183 +0x290
github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands.(*BaseCommand).Execute(0xc00051e3f0, 0xc0000280b0, 0x1, 0x1, 0x0)
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/commands/base_command.go:132 +0x5b5
main.(*MultiappsPlugin).Run(0x16d8ad0, 0x13524c0, 0xc00050fb00, 0xc0000280a0, 0x2, 0x2)
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/multiapps_plugin.go:45 +0x1c2
github.com/cloudfoundry-incubator/multiapps-cli-plugin/vendor/github.com/cloudfoundry/cli/plugin.Start(0x1346340, 0x16d8ad0)
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/vendor/github.com/cloudfoundry/cli/plugin/plugin_shim.go:30 +0x1eb
main.main()
/root/go/src/github.com/cloudfoundry-incubator/multiapps-cli-plugin/multiapps_plugin.go:65 +0x45
Any help on this matter is appreciated.
Thanks.
John
I'm using this library to batch parse a large number of HTML documents in parallel (via antchfx/htmlquery) and it will always result in either a nil pointer dereference or a wrong result.
The issue is the reuse of query objects due to shallow cloning of functionQuery objects.
- htmlquery calls xpath.Expr.Select, which creates a new node iterator containing a cloned query.
- functionQuery is cloned but the clone is shallow, because the function is actually a closure that captured references to the original query objects when it was constructed.
- Those shared query objects are then mutated whenever Select is called (for example attributeQuery.iterator in attributeQuery.Select).
Here is a self-contained example with htmlquery, which uses a global cache of xpath.Expr objects, that hits the problem fairly quickly on my machine.
package main
import (
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"log"
"strings"
)
func work() {
doc, err := html.Parse(strings.NewReader(`
<html>
<head><title>Hello</title></head>
<body>
<div class="c1 c2">A</div>
<div class="c2 c3">B</div>
<div class="c3 c1">C</div>
</body>
</html>
`))
if err != nil {
log.Fatal(err)
}
ns := htmlquery.Find(doc, `//div[contains(@class,"c1")]/text()`)
if len(ns) == 0 {
log.Fatal("text not found")
}
elems := []string{}
for _, n := range ns {
elems = append(elems, htmlquery.InnerText(n))
}
log.Printf(`%d elems: %v`, len(elems), elems)
}
func main() {
for i := 0; i < 4; i++ {
go func() {
for {
work()
}
}()
}
select {}
}
Note that it will run correctly if you remove the contains clause, and just use =, for example.
My current workaround is to disable the query cache in htmlquery, but what really needs to be done is to refactor functionQuery so it can be cloned correctly.
xpath规则: descendant::div[3]/b 无法得到结果。
I'm wondering why the current() function hasn't been implemented, and if it would take much time to do so.
I'm currently in a situation where I would need a query like //cd[@title=current()/@ref]. The node navigator used for the expression evaluation isn't the document root, but a previously selected node.
一直没明白如何输出://img/@src 这样的xpath规则的结果。
我期望的是输出元素img的属性src的值,请帮助我,谢谢!
string-length() includes the length of all whitespace unless it is used with normalize-space(). Used together, you can find content-rich areas:
//p[string-length(normalize-space(self::text())) > 200]
Can we have a release (or a pre-release, say) with a version, so people can keep track of the changes made after the particular point at which they integrated the library into their app?
Thanks in advance.
Input xml:
<root>
<a>
<b>10</b>
<c>ten</c>
</a>
<a>
<b>20</b>
<c>twenty</c>
</a>
</root>
Desired XPATH:
/root/a[b = ('10', '15')]/c
But right now I have to write:
/root/a[b = '10' or b = '15']/c
Or am I missing something?
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.