Please see my problem, believe me it is easy to solve

I tried to use async and await inside the stdout handler of a spawned child process, but it didn’t work. Please see below.

Expected output

 *************
http://www.stevecostellolaw.com/
 *************
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/personal-injury.html
 *************
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/#
 *************
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/home.html
 *************
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/about-us.html
 *************
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/

 *************

Because each time the spawned child's handler reaches await, I expect it to go back to the Python script, print *************, and then print the URL. Ignore the fact that each URL is printed twice here.

Output which I am getting

C:\Users\ASUS\Desktop\searchermc>node app.js
server running on port 3000

DevTools listening on ws://127.0.0.1:52966/devtools/browser/933c20c7-e295-4d84-a4b8-eeb5888ecbbf
[3020:120:0402/105304.190:ERROR:device_event_log_impl.cc(214)] [10:53:04.188] USB: usb_device_handle_win.cc:1056 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[3020:120:0402/105304.190:ERROR:device_event_log_impl.cc(214)] [10:53:04.189] USB: usb_device_handle_win.cc:1056 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)

 *************
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/

 *************

Please see the app.js code below

// form submit request
app.post('/formsubmit', function(req, res){

    // Decode the uploaded CSV and split it into lines, accepting both CRLF and LF line endings.
    csvData = req.files.csvfile.data.toString('utf8');
    filteredArray = cleanArray(csvData.split(/\r?\n/))
    csvData = get_array_string(filteredArray)
    csvData = csvData.trim()
    
    var keywords = req.body.keywords
    keywords = keywords.trim()

    // Send request to python script
    var spawn = require('child_process').spawn;
    var process = spawn('python', ["./webextraction.py", csvData, keywords, req.body.full_search])

    // Collects the trimmed text of every stdout chunk received from the script.
    var outarr = []

    // process.stdout.on('data', (data) => {
    //   console.log(`stdout: ${data}`);
    // });

    // Invoked once per chunk read from the child's stdout; one chunk may contain several printed lines.
    process.stdout.on('data', async function(data){

      console.log("\n ************* ")
      console.log(data.toString().trim())
      await outarr.push(data.toString().trim())
      console.log("\n ************* ")

    });

});

Python function which prints the URLs when the if condition matches

# Function for searching keyword start
def search_keyword(href, search_key):
    """Print and record ``href`` when its page text contains any of the keywords.

    The page is fetched with ``selenium_calling``, parsed with BeautifulSoup,
    and its body text is whitespace-normalised and lower-cased before each
    keyword is looked up case-insensitively as a substring.

    Args:
        href: URL to inspect. It is skipped when the text after its last
            ``.`` is one of the listed file extensions.
        search_key: Iterable of keyword strings.

    Returns:
        1 when at least one keyword is found; otherwise None (skipped
        extension, no keyword matched, or an error while fetching/parsing).

    Side effects:
        On a match, prints ``href`` to stdout, appends it to
        ``found_results`` and stores the matched keywords (as formatted by
        ``getstring``) in ``href_key_dict``. When nothing matches, appends
        ``href`` to ``notfound_results``.
    """
    extension_list = ['mp3', 'jpg', 'exe', 'jpeg', 'png', 'pdf', 'vcf']
    if href.split('.')[-1] not in extension_list:
        try:
            content = selenium_calling(href)
            soup = BeautifulSoup(content, 'html.parser')
            # Collapse every run of whitespace into a single space.
            search_string = re.sub(r"\s+", " ", soup.body.text)
            search_string = search_string.lower()
            res = [ele for ele in search_key if ele.lower() in search_string]
            outstr = getstring(res)
            outstr = outstr.lstrip(", ")
            if len(res) > 0:
                print(href)
                found_results.append(href)
                href_key_dict[href] = outstr
                return 1
            else:
                notfound_results.append(href)
        except Exception:
            # Best effort: a page that fails to load or parse is skipped so the
            # remaining URLs are still processed.
            pass

I want to do all this because the Python script takes a long time to execute and so gives a timeout error each time, so I am thinking to get intermediate ouput of the python script in my nodejs script. You can see the error I am getting in the image below.

enter image description here

Answer

I’m not sure I completely understand what you’re trying to do, but I’ll give it a shot since you seem to have asked this question many times already (which usually isn’t a good idea). I believe that there’s a lack of clarity in your question – it would help a lot if you could clarify what your end goal is (i.e. how do you want this to behave?)

I think you mentioned two separate problems here. The first is that you expect a new line of ‘******’ to be placed before each separate piece of data returned from your script. This is something that can’t be relied on – check out the answer to this question for more detail: Order of process.stdout.on( ‘data’, … ) and process.stderr.on( ‘data’, … ). The data will be passed to your stdout handler in chunks, not line-by-line, and any amount of data can be provided at a time depending how much is currently in the pipe.

The part I’m most confused about is your phrasing of “to get intermediate ouput of the python script in my nodejs script”. There’s not necessarily any “immediate” data – you can’t rely on data coming in at any particular time with your process’s stdout handler, it’s going to hand you data at a pace determined by the Python script itself and the process it’s running in. With that said, it sounds like your main problem here is the timeout happening on your POST. You aren’t ever ending your request – that’s why you’re getting a timeout. I’m going to assume that you want to wait for the first chunk of data – regardless of how many lines it contains – before sending a response back. In that case, you’ll need to add res.send, like this:

    // form submit request
app.post('/formsubmit', function(req, res){

    // Decode the uploaded CSV, split it into lines (CRLF or LF), then let the
    // helpers clean the lines and join them back into one trimmed string.
    const csvLines = req.files.csvfile.data.toString('utf8').split(/\r?\n/);
    const csvData = get_array_string(cleanArray(csvLines)).trim();

    const keywords = req.body.keywords.trim();

    // Send request to python script
    const { spawn } = require('child_process');
    // Named `child` so the handler does not shadow Node's global `process`.
    const child = spawn('python', ["./webextraction.py", csvData, keywords, req.body.full_search]);

    // Collects the trimmed text of every stdout chunk received from the script.
    const outarr = [];

    // Keep track of whether we've already ended the request
    let responseSent = false;

    // Runs `send` only the first time it is called, so the request is ended
    // exactly once no matter which child-process event fires first.
    const respondOnce = (send) => {
        if (responseSent) {
            return;
        }
        responseSent = true;
        send();
    };

    // Invoked once per chunk read from the child's stdout; one chunk may
    // contain several printed lines.
    child.stdout.on('data', (data) => {
        const chunk = data.toString().trim();

        console.log("\n ************* ");
        console.log(chunk);
        outarr.push(chunk);
        console.log("\n ************* ");

        // If the request hasn't already been ended, send back the current output from the script
        // and end the request
        respondOnce(() => res.send(outarr));
    });

    // The script could not be started (e.g. `python` is not on PATH): fail the
    // request instead of leaving it open until it times out.
    child.on('error', (err) => {
        console.error(err);
        respondOnce(() => res.status(500).send('Failed to run webextraction.py'));
    });

    // The script finished without printing anything: still end the request.
    child.on('close', () => {
        respondOnce(() => res.send(outarr));
    });

});