Coder Social home page Coder Social logo

hydralix's People

Contributors

hexrabbit avatar rainrainwu avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar

Watchers

 avatar

hydralix's Issues

[REFACTOR] Similar functions

Merge these two functions into a single function that takes a bool parameter to switch between the two behaviors.

hydralix/hydralix.go

Lines 107 to 119 in 963b8b5

// CommandRecur builds a recur_cmd from two regular-expression patterns.
//
// match and collect are each compiled with regexp.Compile and stored, in that
// order, in the returned recur_cmd together with callback. The last positional
// field is set to false (CommandRecurFlat sets it to true); presumably this is
// the "flat" flag — confirm against the recur_cmd declaration.
//
// It panics with a string message if either pattern fails to compile. The
// message carries the underlying regexp error (which quotes the offending
// pattern) so the failure can be diagnosed from the panic alone.
func CommandRecur(match string, collect string, callback cmd_callback) recur_cmd {
	matchRe, err := regexp.Compile(match)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	collectRe, err := regexp.Compile(collect)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	return recur_cmd{matchRe, collectRe, callback, false}
}

hydralix/hydralix.go

Lines 130 to 142 in 963b8b5

// CommandRecurFlat builds a recur_cmd from two regular-expression patterns.
//
// It is identical to CommandRecur except that the last positional field of
// the returned recur_cmd is set to true; presumably this is the "flat" flag —
// confirm against the recur_cmd declaration.
//
// It panics with a string message if either pattern fails to compile. The
// message carries the underlying regexp error (which quotes the offending
// pattern) so the failure can be diagnosed from the panic alone.
func CommandRecurFlat(match string, collect string, callback cmd_callback) recur_cmd {
	matchRe, err := regexp.Compile(match)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	collectRe, err := regexp.Compile(collect)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	return recur_cmd{matchRe, collectRe, callback, true}
}

[STYLE] Import description

Divide the imported packages into three groups, separated by blank lines:

[Standard packages]

[Third-party packages]

[Self-defined packages]

[STYLE] Uniform comments

Unify the comment style for all function type declarations.

hydralix/hydralix.go

Lines 21 to 38 in 963b8b5

// TODO: add some error handler

type (
	// cmd_callback post-processes a single regex match.
	//
	//   @param  url   {string}   crawled url
	//   @param  group {[]string} matched groups
	//   @return {string, string} two strings; the crawl loop in this file
	//                            assigns them to "processed" (the processed
	//                            data) and "tag", in that order
	cmd_callback func(url string, group []string) (string, string)

	// cmd_filter decides whether a processed match is kept.
	//
	//   @param  {int}    index of data (of the same regex match)
	//   @param  {string} cmd_callback processed data
	//   @return {bool}   choose or not
	cmd_filter func(int, string) bool

	// output_async_callback receives a string and a *sync.Mutex.
	// NOTE(review): presumably one output item plus a shared lock — confirm
	// against the call site.
	output_async_callback func(string, *sync.Mutex)

	// output_sync_callback receives a slice of strings.
	// NOTE(review): presumably the full set of collected outputs — confirm
	// against the call site.
	output_sync_callback func([]string)

	// output_async_callback_int is the same shape as output_async_callback
	// with an additional *sync.WaitGroup argument.
	// NOTE(review): who calls Add/Done on the WaitGroup is not visible here.
	output_async_callback_int func(string, *sync.Mutex, *sync.WaitGroup)
)

[REFACTOR] Similar functions

Merge these two functions into a single function that takes a bool parameter to switch between the two behaviors.

hydralix/hydralix.go

Lines 98 to 105 in 963b8b5

// Command builds a crawl_cmd from a regular-expression pattern.
//
// regex is compiled with regexp.Compile and stored in the returned crawl_cmd
// together with matchTimes, filter and callback, in that order. The last
// positional field is set to false (CommandFlat sets it to true); presumably
// this is the "flat" flag — confirm against the crawl_cmd declaration.
//
// It panics with a string message if regex fails to compile. The message
// carries the underlying regexp error (which quotes the offending pattern) so
// the failure can be diagnosed from the panic alone.
func Command(regex string, matchTimes int, filter cmd_filter, callback cmd_callback) crawl_cmd {
	re, err := regexp.Compile(regex)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	return crawl_cmd{re, matchTimes, filter, callback, false}
}

hydralix/hydralix.go

Lines 121 to 128 in 963b8b5

// CommandFlat builds a crawl_cmd from a regular-expression pattern.
//
// It is identical to Command except that the last positional field of the
// returned crawl_cmd is set to true; presumably this is the "flat" flag —
// confirm against the crawl_cmd declaration.
//
// It panics with a string message if regex fails to compile. The message
// carries the underlying regexp error (which quotes the offending pattern) so
// the failure can be diagnosed from the panic alone.
func CommandFlat(regex string, matchTimes int, filter cmd_filter, callback cmd_callback) crawl_cmd {
	re, err := regexp.Compile(regex)
	if err != nil {
		panic("Regexp compilation failed: " + err.Error())
	}
	return crawl_cmd{re, matchTimes, filter, callback, true}
}

[STYLE] Complicated functions

Extract the operations inside each case clause into separate, independent functions.

hydralix/hydralix.go

Lines 180 to 336 in 963b8b5

// Dispatch on the concrete type of the command registered for this layer.
switch cmd := hydra.commandLayer[layer].(type) {
case crawl_cmd:
	// Single pass: fire one request per target URL, then receive one job per
	// request from ch and extract the next layer's URLs from it.
	ch := make(chan job, 8)
	for _, url := range targetURLs {
		go hydra.request(url, ch)
		fmt.Println("[Request]", url)
	}
	for i := 0; i < len(targetURLs); i++ {
		res := <-ch
		re := cmd.regex
		matched := re.FindAllStringSubmatch(res.data, cmd.matchTimes)
		for matched_idx, lst := range matched {
			/* tag is used to build the tree and to name the folder;
			   tags should differ from each other or the mapping may fail
			*/
			var processed, tag string
			/* post-process */
			if cmd.callback != nil {
				processed, tag = cmd.callback(res.url, lst[1:])
			} else {
				/* no callback: forward the first matched group and use its
				   md5 hex digest as the tag.
				   NOTE: lst[1] requires the regex to have at least one
				   capture group; it panics otherwise.
				*/
				processed = lst[1]
				sum := md5.Sum([]byte(lst[1]))
				tag = hex.EncodeToString(sum[:])
			}
			/* drop duplicated urls to avoid conflict */
			if _, exist := hydra.tagNodeTable[processed]; exist {
				// Printf, not Println: the message contains a %s verb.
				fmt.Printf("[Warning] Fetched duplicated url: %s\n", processed)
				continue
			}
			/* filtering: keep everything when no filter is set */
			if cmd.filter == nil || cmd.filter(matched_idx, processed) {
				crawled = append(crawled, processed)
			}
			/* build tree with tag */
			/* by use of
			   - res.url:
			     used to look up the parent tag node
			   - processed:
			     used as the key under which this match's node is stored
			   maybe change map[url(processed)]nodePtr -> map[tag]nodePtr ?
			   : but you don't know the tag of the download link in the
			     outputCallback function
			*/
			parentPtr := hydra.tagNodeTable[res.url]
			var nodePtr *tagTreeNode
			if cmd.flat {
				// flat: no new tree level, the match shares its parent's node
				nodePtr = parentPtr
			} else {
				nodePtr = &tagTreeNode{
					tag:    tag,
					parent: parentPtr,
				}
			}
			// Registered even when the filter rejected the match, so a later
			// occurrence of the same value is still reported as a duplicate.
			hydra.tagNodeTable[processed] = nodePtr
		}
	}
	// The crawled values become the targets of the next layer; both slices
	// keep their backing arrays.
	targetURLs = append(targetURLs[:0], crawled...)
	crawled = crawled[:0]
case recur_cmd:
	// Repeated passes: keep following cmd.match hits until a pass yields no
	// new URLs, accumulating cmd.collect hits along the way.
	var collection []string
	ch := make(chan job, 8)
	for len(targetURLs) > 0 {
		for _, url := range targetURLs {
			go hydra.request(url, ch)
			fmt.Println("[Request]", url)
		}
		for i := 0; i < len(targetURLs); i++ {
			res := <-ch
			match := cmd.match
			matched := match.FindAllStringSubmatch(res.data, -1)
			collect := cmd.collect
			collected := collect.FindAllStringSubmatch(res.data, -1)
			parentPtr := hydra.tagNodeTable[res.url]
			// Collected values are attached directly to the page's own node
			// (no duplicate check, no new tree level).
			for _, lst := range collected {
				hydra.tagNodeTable[lst[1]] = parentPtr
				collection = append(collection, lst[1])
			}
			for _, lst := range matched {
				/* tag is used to build the tree and to name the folder;
				   tags should differ from each other or the mapping may fail
				*/
				var processed, tag string
				/* post-process */
				if cmd.callback != nil {
					processed, tag = cmd.callback(res.url, lst[1:])
				} else {
					/* no callback: forward the first matched group and use
					   its md5 hex digest as the tag */
					processed = lst[1]
					sum := md5.Sum([]byte(lst[1]))
					tag = hex.EncodeToString(sum[:])
				}
				/* drop duplicated urls to avoid conflict */
				if _, exist := hydra.tagNodeTable[processed]; exist {
					// Printf, not Println: the message contains a %s verb.
					fmt.Printf("[Warning] Fetched duplicated url: %s\n", processed)
					continue
				}
				crawled = append(crawled, processed)
				var nodePtr *tagTreeNode
				if cmd.flat {
					// flat: the match shares its parent's node
					nodePtr = parentPtr
				} else {
					nodePtr = &tagTreeNode{
						tag:    tag,
						parent: parentPtr,
					}
				}
				hydra.tagNodeTable[processed] = nodePtr
			}
		}
		// Next pass visits what this pass discovered.
		targetURLs = append(targetURLs[:0], crawled...)
		crawled = crawled[:0]
	}
	// targetURLs is empty here; the collected values become the targets of
	// the next layer.
	targetURLs = append(targetURLs, collection...)
}
}

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games that makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.